user12398760
user12398760

Reputation: 1

How can I calculate the distance of a state within a cluster from the center of the cluster?

I have a sample of 28 states. I want to plot them in one cluster, identify the center, and then calculate the distance of every state from the center, per year.

My input file resembles the following: first column: Country; second column: Year (from 2008 to 2017); third column: PI (index).

Question 1: I am getting the error: Error in eval(e, x, parent.frame()) : object 'mydata.year' not found when I run: table_2008 = subset(table1, mydata.year ==2008)

Question 2: What code is best suited to calculating the distance of a state from the center of the cluster?

Please find my code below. I hope someone can help.

Thank you.

Code:

# Load the PI data from CSV (comma-separated, first row used as header).
# NOTE(review): `rm(list=ls())` follows the read.csv() call on the same line
# with no newline or `;` between them; as written this does not parse. It
# looks like two statements were merged when the code was pasted - confirm.
# NOTE(review): `head=TRUE` relies on partial matching of the `header` argument.
heisenberg <- read.csv(file="C:/Users/TA/Desktop/R4./PI4.csv",head=TRUE,sep=",") rm(list=ls())

# Read the same file again into `mydata`, the object used from here on.
mydata = read.csv("C:/Users/TA/Desktop/R4./PI4.csv",sep = ",", header=TRUE)

# Evaluate the three columns of interest (auto-printed when run at top level).
# NOTE(review): the question text calls the second column "Year", but the code
# reads `Category` - confirm the actual header names in the CSV.
mydata$Country
mydata$Category
mydata$PI

# Build the working data frame. data.frame() derives column names from the
# argument expressions, so the columns are named mydata.Country,
# mydata.Category and mydata.PI (not Country / Year / PI).
data_cluster = data.frame(mydata$Country,mydata$Category,mydata$PI)

# Save the three-column table without row names.
write.csv(data_cluster,"C:/Users/TA/Desktop/R4./OutputPI.csv", row.names = FALSE)


# Work on a copy of the three-column table.
table1 = data_cluster



#plot(uk_line[,4])
# Drop every row that contains an NA in any column.
table1 = na.omit(table1)

# Containers for per-year results. ts() is called with its data argument left
# empty, so the default (NA) is used: each is an all-NA yearly series running
# from 2008 to 2017 whose slots are filled in later.
within_results = ts(,start = c(2008), end = c(2017), frequency = 1)
within_resultsbetweenss = ts(,start = c(2008), end = c(2017), frequency = 1)
# 10 x 4 matrix of NA for the within-cluster sums of squares.
within_results_withinss = matrix(data= NA, nrow = 10, ncol = 4) 
#nrow = years, ncols = number of clusters

#colnames(mydata, c("Country","Year"))

#YEAR 2008
#SELECTING A GIVEN YEAR (subset of rows such that year = 2008)
# NOTE(review): table1 comes from data.frame(mydata$Country, mydata$Category,
# mydata$PI), so its columns are mydata.Country, mydata.Category and mydata.PI.
# There is no column called `mydata.year`, which is exactly what the
# "object 'mydata.year' not found" error reports.
table_2008 = subset(table1, mydata.year ==2008)
table_2008


# Keep the columns from the third one onwards as the clustering input.
# NOTE(review): if table_2008 has exactly three columns, 3:ncol(table_2008)
# selects a single column and `[` drops it to a plain vector; the rownames<-
# call below would then fail. `drop = FALSE` would keep a data frame - confirm.
data2008_clus = table_2008[,3:ncol(table_2008)]

#NAMING THE ROWS USING THE COUNTRY NAMES
rownames(data2008_clus) = table_2008$mydata.Country

data2008_clus


# Default plot of the 2008 subset (all columns).
plot(table_2008)

# Within-groups sum of squares for k = 1..15, plotted against k.
# wss[1] (one cluster) is computed directly as (n - 1) * sum of the column
# variances; wss[k] for k = 2..15 is the sum of kmeans()$withinss for that k.
# NOTE(review): kmeans() picks random starting centres and no seed is set, so
# the curve can change between runs.
# NOTE(review): kmeans() needs more distinct data points than centres; with
# about 28 rows for one year, the upper limit of 15 is worth confirming.
wss <- (nrow(data2008_clus)-1)*sum(apply(data2008_clus,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(data2008_clus,
                                     centers=i)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",
     ylab="Within groups sum of squares")




# Compute k-means with k = 1
# With centers = 1 every row is assigned to the same cluster, and
# fit1$centers holds that single cluster's centre.


fit1=kmeans(x = data2008_clus,centers = 1)
# Cluster assignment of each row.
fit1$cluster
# NOTE(review): fviz_cluster() is not defined or loaded anywhere in the
# visible code; it presumably comes from the factoextra package - confirm a
# library() call exists before this point.
fviz_cluster(fit1,data = data2008_clus)
# Within-cluster, total and between-cluster sums of squares.
fit1$withinss
fit1$totss
fit1$betweenss
# Attach the cluster label to the 2008 table and keep the centre(s) as a
# data frame.
table_2008$cluster = factor(fit1$cluster)
centers=as.data.frame(fit1$centers)
table_2008

# Store the fit's sums of squares in the first slot / first row of the
# result containers.
within_results[1] = fit1$totss
within_resultsbetweenss[1] = fit1$betweenss
# NOTE(review): the assignment below appears twice. fit1 was fitted with a
# single centre, so fit1$withinss should have length one and be recycled
# across every column of row 1 - confirm this is intended.
within_results_withinss[1,] = fit1$withinss
within_results_withinss[1,] =  fit1$withinss

# Plot each result container with its default plot method.
plot(within_results)
plot(within_resultsbetweenss)
plot(within_results_withinss)

# Print the results
# NOTE(review): `km.res` is never assigned in the visible code (the fitted
# object above is named `fit1`), so this line presumably fails with
# "object 'km.res' not found" - confirm.
print(km.res)
table_2008

# Sample of the input data: six rows, all for 2008. Country is a factor that
# carries all 28 country levels even though only the first six are used.
mydata_struct <- structure(
  list(
    Year = rep(2008L, 6L),
    Country = structure(
      1:6,
      .Label = c(
        "Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czechia",
        "Denmark", "Estonia", "Finland", "France", "Germany", "Greece",
        "Hungary", "Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg",
        "Malta", "Netherlands", "Poland", "Portugal", "Romania", "Slovakia",
        "Slovenia", "Spain", "Sweden", "United Kingdom"
      ),
      class = "factor"
    ),
    Prosperity.Index = c(79.4, 76.1, 62, 65.1, 69.9, 70.9)
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)

Upvotes: 0

Views: 150

Answers (1)

dcarlson
dcarlson

Reputation: 11066

We can generate random values for the rest of the years. You do not need to do this since your data is complete. I'm just trying to create data that resembles yours:

# Stand-in for the asker's data: one row per country (28) for 2008, plus a
# single extra Austria row for 2009.
mydata_struct <- structure(
  list(
    Year = c(rep(2008L, 28L), 2009L),
    Country = structure(
      c(1:28, 1L),
      .Label = c(
        "Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czechia",
        "Denmark", "Estonia", "Finland", "France", "Germany", "Greece",
        "Hungary", "Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg",
        "Malta", "Netherlands", "Poland", "Portugal", "Romania", "Slovakia",
        "Slovenia", "Spain", "Sweden", "United Kingdom"
      ),
      class = "factor"
    ),
    Prosperity.Index = c(
      79.4, 76.1, 62, 65.1, 69.9, 70.9, 83.2, 73.5, 81.2, 75.9,
      79.9, 66, 66.7, 78.9, 69.6, 67.7, 66.6, 79.9, 73.4, 81.2,
      66.9, 71, 62.6, 68.2, 72.7, 72.6, 82.8, 78, 79.4
    )
  ),
  row.names = c(NA, 29L),
  class = "data.frame"
)

Now we create data for the other years and a data frame Prosperity by copying the data for the first year:

# Repeat the 28 base-year countries and their PI values once for each of the
# ten years 2008-2017 (280 rows in total). `years` and `prosp` are reused
# afterwards when the values are fuzzed.
names <- rep(head(mydata_struct$Country, 28), times = 10)
years <- rep(2008:2017, each = 28)
prosp <- rep(head(mydata_struct$Prosperity.Index, 28), times = 10)
Prosperity <- data.frame(Country = names, Year = years, PI = prosp)

Now we will fuzz the other years and add a trend toward increasing prosperity:

# Fix the seed so the simulated values are reproducible.
set.seed(42)
# Replace PI with a noisy copy plus a yearly upward trend. The three random
# draws are made in the same order as in a single nested expression: first
# the per-row standard deviations, then the noisy base values, then the
# per-row trend slopes.
Prosperity$PI <- local({
  noise_sd <- rnorm(280, mean = 2, sd = .25)
  noisy_base <- rnorm(280, mean = prosp, sd = noise_sd)
  yearly_slope <- rnorm(280, mean = 1, sd = .25)
  years_elapsed <- years - years[1]
  noisy_base + years_elapsed * yearly_slope
})

Here is where you start with your actual data. First we can get some statistics:

# Show four significant digits in printed output.
options(digits = 4)

# Mean PI for each year
tapply(Prosperity$PI, Prosperity$Year, mean)
#  2008  2009  2010  2011  2012  2013  2014  2015  2016  2017 
# 73.02 73.74 74.50 76.50 76.13 77.95 78.33 79.55 80.85 81.71 

# Standard deviation for PI for each year
tapply(Prosperity$PI, Prosperity$Year, sd)
#  2008  2009  2010  2011  2012  2013  2014  2015  2016  2017 
# 6.861 6.422 6.840 6.935 6.582 6.592 8.331 6.777 7.489 8.044 

# Range in PI for each year (largest value minus smallest value)
tapply(
  Prosperity$PI,
  Prosperity$Year,
  function(pi_in_year) max(pi_in_year) - min(pi_in_year)
)
#  2008  2009  2010  2011  2012  2013  2014  2015  2016  2017 
# 28.24 23.24 22.20 24.83 23.99 23.26 27.97 22.77 27.78 30.66 

Finally some plots:

# Scatterplot of every PI value against its year.
plot(PI ~ Year, data = Prosperity)
# One box per year summarising the distribution of PI.
boxplot(PI ~ Year, data = Prosperity)

Scatterplot

Boxplots

Upvotes: 0

Related Questions