Jonathan Zimmermann
Jonathan Zimmermann

Reputation: 91

as.matrix() and as.dist() have different results

I have a list "simil", which contains 7 vectors:

 > dput(simil)
structure(list(Monday = structure(c(0.889987253484581, 0.882957894295089, 
0.882232353177177, 0.874080268021168, 0.851760771472629, 0.811536071048775
), .Names = c("Sunday", "Tuesday", "Friday", "Wednesday", "Thursday", 
"Saturday")), Tuesday = structure(c(0.901682757072732, 0.882957894295089, 
0.874716806575548, 0.869202937572079, 0.855248496101086, 0.818659253763272
), .Names = c("Sunday", "Monday", "Wednesday", "Friday", "Thursday", 
"Saturday")), Wednesday = structure(c(0.88354911311872, 0.874716806575548, 
0.874080268021168, 0.853293126413937, 0.851921112754124, 0.841170795359615
), .Names = c("Sunday", "Tuesday", "Monday", "Friday", "Thursday", 
"Saturday")), Thursday = structure(c(0.86579834238668, 0.855248496101086, 
0.851921112754124, 0.851760771472629, 0.851384896045153, 0.836732564057725
), .Names = c("Sunday", "Tuesday", "Wednesday", "Monday", "Friday", 
"Saturday")), Friday = structure(c(0.882232353177177, 0.869202937572079, 
0.856441568566172, 0.853293126413937, 0.851384896045153, 0.80098779448239
), .Names = c("Monday", "Tuesday", "Sunday", "Wednesday", "Thursday", 
"Saturday")), Saturday = structure(c(0.866654844262859, 0.841170795359615, 
0.836732564057725, 0.818659253763272, 0.811536071048775, 0.80098779448239
), .Names = c("Sunday", "Wednesday", "Thursday", "Tuesday", "Monday", 
"Friday")), Sunday = structure(c(0.901682757072732, 0.889987253484581, 
0.88354911311872, 0.866654844262859, 0.86579834238668, 0.856441568566172
), .Names = c("Tuesday", "Monday", "Wednesday", "Saturday", "Thursday", 
"Friday"))), .Names = c("Monday", "Tuesday", "Wednesday", "Thursday", 
"Friday", "Saturday", "Sunday"), class = c("similMatrix", "list"
))

I now want to transform it into a dist object to then use it for hclust(). So I use as.dist() and I compute:

> as.dist(simil,diag = TRUE, upper = TRUE)
             Monday    Sunday   Tuesday    Friday Wednesday  Thursday  Saturday
Monday    0.0000000 0.8899873 0.8829579 0.8822324 0.8740803 0.8517608 0.8115361
Sunday    0.8899873 0.0000000 1.0000000 0.8692029 0.8747168 0.8552485 0.8186593
Tuesday   0.8829579 1.0000000 0.0000000 0.8532931 1.0000000 0.8519211 0.8411708
Friday    0.8822324 0.8692029 0.8532931 0.0000000 0.8519211 1.0000000 0.8367326
Wednesday 0.8740803 0.8747168 1.0000000 0.8519211 0.0000000 0.8513849 0.8009878
Thursday  0.8517608 0.8552485 0.8519211 1.0000000 0.8513849 0.0000000 1.0000000
Saturday  0.8115361 0.8186593 0.8411708 0.8367326 0.8009878 1.0000000 0.0000000

But this is a slightly different result from when I use as.matrix():

> as.matrix(simil)
             Monday   Tuesday Wednesday  Thursday    Friday  Saturday    Sunday
Monday    1.0000000 0.8829579 0.8740803 0.8517608 0.8822324 0.8115361 0.8899873
Sunday    0.8899873 0.9016828 0.8835491 0.8657983 0.8564416 0.8666548 1.0000000
Tuesday   0.8829579 1.0000000 0.8747168 0.8552485 0.8692029 0.8186593 0.9016828
Friday    0.8822324 0.8692029 0.8532931 0.8513849 1.0000000 0.8009878 0.8564416
Wednesday 0.8740803 0.8747168 1.0000000 0.8519211 0.8532931 0.8411708 0.8835491
Thursday  0.8517608 0.8552485 0.8519211 1.0000000 0.8513849 0.8367326 0.8657983
Saturday  0.8115361 0.8186593 0.8411708 0.8367326 0.8009878 1.0000000 0.8666548

With as.dist(), the matrix is not entirely symmetric and some pairs become wrong, which doesn't happen with as.matrix(). Why is that? How can I correct it?

Upvotes: 1

Views: 1087

Answers (1)

Jonathan Zimmermann
Jonathan Zimmermann

Reputation: 91

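So in the end I managed to fix it by first converting the object into a matrix, then reordering the rows so that they match the column order, and finally converting it into a dist object (using 1 - similarity, since hclust() expects distances):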

# Convert the similarity object into a plain numeric matrix.
simil <- as.matrix(simil)
# The rows come back in a different order than the columns, and as.dist()
# takes the lower triangle by position, so align the rows to the column
# order by name instead of relying on a hard-coded permutation.
simil <- simil[colnames(simil), , drop = FALSE]
# hclust() works on dissimilarities: turn each similarity s into 1 - s.
simil <- as.dist(1 - simil, diag = TRUE, upper = TRUE)

> simil
              Monday    Tuesday  Wednesday   Thursday     Friday   Saturday     Sunday
Monday    0.00000000 0.11704211 0.12591973 0.14823923 0.11776765 0.18846393 0.11001275
Tuesday   0.11704211 0.00000000 0.12528319 0.14475150 0.13079706 0.18134075 0.09831724
Wednesday 0.12591973 0.12528319 0.00000000 0.14807889 0.14670687 0.15882920 0.11645089
Thursday  0.14823923 0.14475150 0.14807889 0.00000000 0.14861510 0.16326744 0.13420166
Friday    0.11776765 0.13079706 0.14670687 0.14861510 0.00000000 0.19901221 0.14355843
Saturday  0.18846393 0.18134075 0.15882920 0.16326744 0.19901221 0.00000000 0.13334516
Sunday    0.11001275 0.09831724 0.11645089 0.13420166 0.14355843 0.13334516 0.00000000

The mismatched row order might be due to the fact that "simil" was created with the similarity() function of the quanteda package.

Upvotes: 1

Related Questions