# Использование метода К-средних

library(ggplot2)
library(scales)   # Need the scales package

# Пример с одномерным распределением

set.seed(123)

data <- round(c(rnorm(5000, 5000, 400), 
                rnorm(2000, 7000, 500)),0)

dd <- with(density(data),data.frame(x,y))

qplot(x,y,data=dd,geom="line")+
  theme_bw() + xlab("Зарплата") + 
  ylab("%") + scale_y_continuous(labels=percent)

qplot(x,y,data=dd,geom="line")+
  geom_ribbon(data=subset(dd,x>mean(data)),aes(ymax=y),ymin=0, fill="red",alpha=0.5) +
  geom_ribbon(data=subset(dd,x<mean(data)),aes(ymax=y),ymin=0, fill="blue",alpha=0.5) +
  theme_bw() + xlab("Зарплата (распределение по среднему)") + ylab("%") + scale_y_continuous(labels=percent)

qplot(x,y,data=dd,geom="line")+
  geom_ribbon(data=subset(dd,x>median(data)),aes(ymax=y),ymin=0, fill="red",alpha=0.5) +
  geom_ribbon(data=subset(dd,x<median(data)),aes(ymax=y),ymin=0, fill="blue",alpha=0.5) +
  theme_bw() + xlab("Зарплата (распределение по медиане)") + ylab("%") + scale_y_continuous(labels=percent)

set.seed(123)
center = kmeans(data, centers=2)

qplot(x,y,data=dd,geom="line")+
  geom_ribbon(data=subset(dd,x>min(data[center$cluster == 2])),aes(ymax=y),ymin=0, fill="red",colour=NA,alpha=0.5) +
  geom_ribbon(data=subset(dd,x<max(data[center$cluster == 1])),aes(ymax=y),ymin=0, fill="blue",colour=NA,alpha=0.5) +
  theme_bw() + xlab("Зарплата (распределение по кластерам)") + ylab("%") + scale_y_continuous(labels=percent)

# Приклад з невдалими кластерами
center = kmeans(matrix(data), centers=matrix(sort(data)[c(1,2000)]), iter.max = 1, algorithm = "Forgy")

qplot(x,y,data=dd,geom="line")+
  geom_ribbon(data=subset(dd,x>min(data[center$cluster == 2])),aes(ymax=y),ymin=0, fill="red",colour=NA,alpha=0.5) +
  geom_ribbon(data=subset(dd,x<max(data[center$cluster == 1])),aes(ymax=y),ymin=0, fill="blue",colour=NA,alpha=0.5) + 
  scale_y_continuous(labels=percent)

# Круглые кластеры

r <- jitter(rep(4,500),4)
degs <- 360*runif(500)
theta <- jitter(2*pi*degs/360)
plot(r*sin(theta),r*cos(theta))
data1 <- data.frame(x=r*sin(theta),y = r*cos(theta))

r <- jitter(rep(2,500),2)
data2 <- data.frame(x=r*sin(theta)+4.5,y = r*cos(theta))

data <- as.data.frame(rbind(data1, data2))
clusters <- kmeans(data, centers = 2)

ggplot(data, aes(x = x, y = y)) + geom_point(aes(color=factor(clusters$cluster)), alpha = 0.5) +
  geom_point(data = as.data.frame(clusters$centers), aes(x = x, y = y, shape = "❌", color = factor(c(1:2))), size = 10) +
  scale_shape_identity() +
  theme_bw() + theme(legend.position="none")

# Отсутствие кластеров в наборе

data <- data.frame(x = rnorm(1000), y = rnorm(1000))
clusters <- kmeans(data, centers = 2)

ggplot(data, aes(x = x, y = y)) + geom_point(aes(color=factor(clusters$cluster)), alpha = 0.5) +
  geom_point(data = as.data.frame(clusters$centers), aes(x = x, y = y, shape = "❌", color = factor(c(1:2))), size = 10) +
  scale_shape_identity() +
  theme_bw() + theme(legend.position="none")

# Генерация данных на 5 кластеров

set.seed(1)
k=5; n=50
data <- data.frame(x = rnorm(k,50,2), y = rnorm(k,50,3))

for (row in 1:nrow(data)) {
  data <- rbind(data,data.frame(x = rnorm(n,data[row,]$x,0.3), y = rnorm(n, data[row,]$y,0.3)))
}

ggplot(data, aes(x = x, y = y)) + geom_point(color='#777777') +
  theme_bw()

# Два решения с разными исходными центрами

set.seed(1)
clusters <- kmeans(data, centers = 5)

ggplot(data, aes(x = x, y = y)) + geom_point(aes(color=factor(clusters$cluster)), alpha = 0.5) +
  geom_point(data = as.data.frame(clusters$centers), aes(x = x, y = y, shape = "❌", color = factor(c(1:5))), size = 10) +
  scale_shape_identity() +
  theme_bw() + theme(legend.position="none")

set.seed(2)
clusters <- kmeans(data, centers = 5)

ggplot(data, aes(x = x, y = y)) + geom_point(aes(color=factor(clusters$cluster)), alpha = 0.5) +
  geom_point(data = as.data.frame(clusters$centers), aes(x = x, y = y, shape = "❌", color = factor(c(1:5))), size = 10) +
  scale_shape_identity() +
  theme_bw() + theme(legend.position="none")

# Пошаговое ручное решение

ggplot(data, aes(x = x, y = y)) + geom_point(color='#777777') +
  theme_bw()

set.seed(123)
init_centers = sample(1:nrow(data), size=5)

ggplot(data, aes(x = x, y = y)) + geom_point(color='#777777', alpha = 0.5) +
  geom_point(data = as.data.frame(data[init_centers,]), aes(x = x, y = y, shape = "❌", color = factor(c(1:5))), size = 10) +
  scale_shape_identity() +
  theme_bw() + theme(legend.position="none")

clusters <- apply(as.matrix(dist(data))[,init_centers], 1, which.min)

ggplot(data, aes(x = x, y = y)) + geom_point(aes(color=factor(clusters)), alpha = 0.5) +
  geom_point(data = as.data.frame(data[init_centers,]), aes(x = x, y = y, shape = "❌", color = factor(c(1:5))), size = 10) +
  scale_shape_identity() +
  theme_bw() + theme(legend.position="none")

new_centers <- aggregate(data, list(clusters), mean)[c("x", "y")]

ggplot(data, aes(x = x, y = y)) + geom_point(aes(color = factor(clusters)), alpha = 0.5) +
  geom_point(data = as.data.frame(new_centers), aes(x = x, y = y, shape = "❌", color = factor(c(1:5))), size = 10) +
  scale_shape_identity() +
  theme_bw() + theme(legend.position="none")

clusters <- apply(as.matrix(dist(rbind(data,new_centers)))[1:nrow(data),(nrow(data)+1):(nrow(data)+5)], 1, which.min)

ggplot(data, aes(x = x, y = y)) + geom_point(aes(color = factor(clusters)), alpha = 0.5) +
  geom_point(data = as.data.frame(new_centers), aes(x = x, y = y, shape = "❌", color = factor(c(1:5))), size = 10) +
  scale_shape_identity() +
  theme_bw() + theme(legend.position="none")

new_centers <- aggregate(data, list(clusters), mean)[c("x", "y")]

ggplot(data, aes(x = x, y = y)) + geom_point(aes(color = factor(clusters)), alpha = 0.5) +
  geom_point(data = as.data.frame(new_centers), aes(x = x, y = y, shape = "❌", color = factor(c(1:5))), size = 10) +
  scale_shape_identity() +
  theme_bw() + theme(legend.position="none")

clusters <- apply(as.matrix(dist(rbind(data,new_centers)))[1:nrow(data),(nrow(data)+1):(nrow(data)+5)], 1, which.min)

ggplot(data, aes(x = x, y = y)) + geom_point(aes(color = factor(clusters)), alpha = 0.5) +
  geom_point(data = as.data.frame(new_centers), aes(x = x, y = y, shape = "❌", color = factor(c(1:5))), size = 10) +
  scale_shape_identity() +
  theme_bw() + theme(legend.position="none")

new_centers <- aggregate(data, list(clusters), mean)[c("x", "y")]

ggplot(data, aes(x = x, y = y)) + geom_point(aes(color = factor(clusters)), alpha = 0.5) +
  geom_point(data = as.data.frame(new_centers), aes(x = x, y = y, shape = "❌", color = factor(c(1:5))), size = 10) +
  scale_shape_identity() +
  theme_bw() + theme(legend.position="none")

# Критерий "каменистой осыпи"

wss <- (nrow(data)-1)*sum(apply(data,2,var))
set.seed(1)
for (i in 2:15) 
  wss[i] <- sum(kmeans(data, centers=i)$withinss)

ggplot(data.frame(wss = wss, index = 1:length(wss)), aes(x=index, y=wss, group=1)) +
  geom_line()+
  geom_point() + theme_bw()

# Проблема с разными единицами измерения

set.seed(1)
data1 <- data.frame(x = rnorm(1000), y = rnorm(1000)*5)
data2 <- data.frame(x = rnorm(1000) + 5, y = rnorm(1000)*5)
data <- as.data.frame(rbind(data1,data2))

set.seed(1)
clusters <- kmeans(data, centers = 2)

ggplot(data, aes(x = x, y = y)) + geom_point(aes(color=factor(clusters$cluster)), alpha = 0.5) +
  geom_point(data = as.data.frame(clusters$centers), aes(x = x, y = y, shape = "❌", color = factor(c(1:2)), size = 10)) +
  scale_shape_identity() + xlim(-20, 20) + ylim(-20, 20) +
  theme_bw() + theme(legend.position="none")

set.seed(1)
clusters <- kmeans(scale(data), centers = 2)

ggplot(data.frame(scale(data)), aes(x = x, y = y)) + geom_point(aes(color=factor(clusters$cluster)), alpha = 0.5) +
  geom_point(data = as.data.frame(clusters$centers), aes(x = x, y = y, shape = "❌", color = factor(c(1:2)), size = 10)) +
  scale_shape_identity() +
  theme_bw() + theme(legend.position="none")

# Отсутствие кластеров в наборе и критерий различия

set.seed(1)
data <- data.frame(x = rnorm(1000), y = rnorm(1000))
clusters <- kmeans(data, centers = 2)

ggplot(data, aes(x = x, y = y)) + geom_point(aes(color=factor(clusters$cluster)), alpha = 0.5) +
  geom_point(data = as.data.frame(clusters$centers), aes(x = x, y = y, shape = "❌", color = factor(c(1:2))), size = 10) +
  scale_shape_identity() +
  theme_bw() + theme(legend.position="none")

set.seed(1)
data$cluster <- clusters$cluster

ggplot(data, aes(x = cluster, y = x, group = cluster)) +
  geom_boxplot(aes(fill = factor(data$cluster)), color="black")+
  theme_bw() + theme(legend.position="none") +
  xlab(paste("T-test score is",round(t.test(data$x~data$cluster)$statistic,4)))

ggplot(data, aes(x = cluster, y = y, group = cluster)) +
  geom_boxplot(aes(fill = factor(data$cluster)), color="black")+
  theme_bw() + theme(legend.position="none") +
  xlab(paste("T-test score is",round(t.test(data$y~data$cluster)$statistic,4)))

wss <- (nrow(data)-1)*sum(apply(data,2,var))
set.seed(1)
for (i in 2:15) 
  wss[i] <- sum(kmeans(data, centers=i)$withinss)

library(reshape2)
means<-melt(clusters$centers)

ggplot(means, aes(x=Var2, y=value, group=Var2, color = factor(Var1))) +
  geom_point() + geom_line(data=means, aes(x = Var1, y = value[c(2,4,1,3)], color = c("2","2","1","1"))) +
  xlab("Переменные") + ylab("Значение среднего") +
  theme_bw() + theme(legend.position="none")
