Plotting the distribution for multiple columns

Question

I would like to plot the distribution of multiple columns of my data set. It has over 820.000 rows and 18 columns. I want to plot all columns except the columns with the dummy variables. I have already been able to create a graphic. But I want to have the values of the x-axis on the y-axis because these are the column values and I want to display their distribution for each column.

1. Definition of the path

setwd("C:/Users/A/Documents/Master BWL/Masterarbeit")

2. Loading the required packages

library(factoextra); library(cluster); library(skmeans); library(mclust); 
library(fpc); library(psda); library(simEd); library (ggpubr);
library(dbscan); library(clustertend); library(MASS); library(devtools);
library(ggbiplot);library(NbClust); library(clValid); library(plotrix)
library(graphics); library(reshape2)

3. Import csv file

WKA_ohneJB <- read.csv("WKA_ohneJB_PCA.csv", header=TRUE, sep = ";",  stringsAsFactors = FALSE)

4 Select columns

WKA_ohneJB2 <- c(WKA_ohneJB[, "BASKETS_NZ"], WKA_ohneJB[, "PIS"],  WKA_ohneJB[, "PIS_AP"],
             WKA_ohneJB[, "PIS_DV"], WKA_ohneJB[, "PIS_PL"], WKA_ohneJB [, "PIS_SDV"],
            WKA_ohneJB[, "PIS_SHOPS"], WKA_ohneJB[,"PIS_SR"], WKA_ohneJB[, "QUANTITY"]

)

df <- melt(WKA_ohneJB2)

5 Plot

ggplot(df) + 
geom_col(aes(x= WKA_ohneJB2 , y=value))

This is the plot I have generated so far.

Here is a part of my dataset:

dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L, 
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L, 
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L, 
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L, 
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L), 
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L, 
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L, 
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L, 
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L, 
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L, 
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L, 
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L, 
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L, 
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L, 
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L, 
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L, 
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L, 
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L), 
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L, 
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L

), class = "data.frame")

6 Plotting histogram

var_to_plot = c("BASKETS_NZ","PIS","PIS_AP","PIS_DV","PIS_PL","PIS_SDV", "PIS_SHOPS","PIS_SR", "QUANTITY") par(mfrow=c(3,3)) for(i in var_to_plot){hist(WKA_ohneJB[,i],xlab=i,main="")}

I have created several histograms. But the scaling of the axes is wrong. I want the numerical values of the x axis to appear on the y axis and the numerical values of the y axis to appear on the x axis. How does this work? I also want the values to be displayed completely and not as e^.

StupidWolf · Accepted Answer

You don't need to combine your dataframe all over again. What you need is either a density plot or histogram.

Also as good practice, load only the packages required for plotting, in this case it would be maybe ggplot2 and tidyr.

For example, I just used an example with 5 of the column names I can see in your data:

library(tidyr)
library(ggplot2)

WKA_ohneJB = data.frame(dummyvar=1:10000,sapply(1:5,rnorm,n=10000))
colnames(WKA_ohneJB)[-1] = c("BASKETS_NZ","PIS","PIS_AP","PIS_DV","PIS_PL")
head(WKA_ohneJB)

  dummyvar  BASKETS_NZ       PIS   PIS_AP   PIS_DV   PIS_PL
1        1  0.92088518 0.9167877 1.956920 4.695379 4.349631
2        2  0.05335686 2.8225161 3.059749 4.317281 5.985579
3        3  1.00141759 3.5743033 2.499662 4.761415 5.886588
4        4 -1.31231486 2.5335004 5.396917 4.364643 5.866026
5        5 -0.65336724 0.2647117 3.203358 4.838659 4.437011
6        6  0.78769080 0.3630670 2.516433 3.826074 3.741611

To one of them do:

ggplot(WKA_ohneJB,aes(x=PIS)) + geom_histogram()

Or:

ggplot(WKA_ohneJB,aes(x=PIS)) + geom_density()

To plot everything at one go, you can try to pivot it long, as you have done with melt, but I don't know if your machine can handle it, so try it for a few variables first:

var_to_plot = c("BASKETS_NZ","PIS","PIS_AP","PIS_DV","PIS_PL")
dummyvar = "dummyvar"
ggplot(pivot_longer(WKA_ohneJB[,c(var_to_plot,dummyvar)],-dummyvar),
aes(x=value)) +
geom_histogram() +
facet_wrap(~name)

If melting the data.frame is too intensive, just use baseR plot:

# means 2 rows, 3 columns
par(mfrow=c(2,3))
for(i in var_to_plot){hist(WKA_ohneJB[,i],xlab=i,main="")}