Manps
Manps

Reputation: 11

How do I fit a Gaussian curve to this data?

I am trying to get a curve to fit to this scatter data that gives me a Gaussian curve:

library(tidyverse)
MK20 <- tribble(~X.Intensity,    ~Average,
             0.400,  0.0000000,
             0.463,  0.0000000,
             0.536,  0.000000,
             0.621,  0.0000000,
             0.719,  0.0000000,
             0.833,  0.0000000,
             0.965,  0.0000000,
             1.120,  0.0000000,
             1.290,  0.0000000,
             1.500,  0.0000000,
             1.740,  0.0000000,
             2.010,  0.0000000,
             2.330,  0.0000000,
             2.700,  0.0000000,
             3.120,  0.0000000,
             3.620,  0.0000000,
             4.190,  0.0000000,
             4.850,  0.0000000,
             5.610,  0.0000000,
             6.500,  0.0000000,
             7.530,  0.0000000,
             8.720,  0.0000000,
             10.100,  0.0000000,
             11.700,  0.0000000,
             13.500,  0.0000000,
             15.700,  0.0000000,
             18.200,  0.0000000,
             21.000,  0.0000000,
             24.400,  0.0000000,
             28.200,  0.0000000,
             32.700,  0.0000000,
             37.800,  0.0000000,
             43.800,  0.7023333,
             50.700,  3.3700000,
             58.800,  7.3933333,
             68.100, 11.4666667,
             78.800, 14.3666667,
             91.300, 15.4000000,
             106.000, 14.5000000,
             122.000, 12.0000000,
             142.000,  8.6433333,
             164.000,  5.2200000,
             190.000,  2.4500000,
             220.000,  0.7580000,
             255.000,  0.1306667,
             295.000,  0.0000000,
             342.000,  0.0000000,
             396.000,  0.0000000,
             459.000,  0.0000000,
             531.000,  0.0000000,
             615.000,  0.0000000,
             712.000,  0.0000000,
             825.000,  0.0000000,
             955.000,  0.0000000,
             1110.000,  0.0000000,
             1280.000,  0.0000000,
             1480.000,  0.0000000,
             1720.000,  0.0000000,
             1990.000,  0.0000000,
             2300.000,  0.0000000,
             2670.000,  0.0000000,
             3090.000,  0.0000000,
             3580.000,  0.0000000,
             4150.000,  0.0000000,
             4800.000,  0.0000000,
             5560.000,  0.0000000,
             6440.000,  0.0000000,
             7460.000,  0.0000000,
             8630.000,  0.0000000)

The code I'm using to plot is:

plot(log10(MK20$X.Intensity), MK20$Average, col=1, xlim=c(-0.5,4), 
    ylim=c(0,20), xlab="Log(Average diameter)", ylab="Intensity", xaxt='n')

I'm using the minor.tick.axis function to add minor ticks on the logarithmic x axis. I want to add a Gaussian curve (which fits best) to this data. I tried to add a type='l' on the plot but the curve wasn't smooth and I don't want a curve that necessarily touches every data point but one that fits best.

Upvotes: 1

Views: 1979

Answers (2)

Tony Ladson
Tony Ladson

Reputation: 3649

We can't use the usual fitdistr approach to fit a normal distribution in this case because we don't have the original data. It looks like the 'Average' column is some type of density estimate. If it was a pdf then it should integrated to 1 but it doesn't.

f <- approxfun(x = log10(MK20$X.Intensity), y= MK20$Average)
integrate(f, lower = log10(0.4), upper = log10(8630))

#6.142134 with absolute error < 0.00043

So we could turn this into a pdf by scaling it down by about 6.14 and then try finding the mean and standard deviation to match that pdf.

Here is a first attempt at a simple Gaussian fit. First I chose a mean of 2 (by looking at where the density was the greatest), a scaling factor of k = 6.14 (the value of the integral) and then played with the sd until there was a reasonable fit.

m=2
s=0.15
k=6.14
x_seq = seq(1,3,length.out = 100)
df <- tibble(x_seq = x_seq, dens = dnorm(x_seq, m, s))


MK20 %>% 
  mutate(log_intensity = log10(X.Intensity)) %>% 
  ggplot(aes(log_intensity, Average/k)) +
  geom_point() +
  geom_line(data = df, aes(x_seq, dens)) 

enter image description here

Next I used optimx to fit the 3 parameters (k = scaling factor, m = mean, s = standard deviation) by minimising the sum of squares between the fit and the data.

Objective function (sum of squares of differences between fit and data)

f <- function(x) {
  k = x[1]
  m = x[2]
  s = x[3]
  MK20 %>% 
  mutate(log_intensity = log10(X.Intensity)) %>%
  mutate(fit = dnorm(log_intensity, m, s)) %>% 
  summarise(sum((fit - Average/k)^2)) %>% pull
}

Use optimx to find parameters (minimise sum of squares) The initial values for the parameters are taken from the fit by eye.

library(optimx)    
optimx(par = c(6.14, 2, 0.15), fn = f )

#k = 6.294696 m = 1.971488 s= 0.1583936 

Lets replot with the fitted parameters

# points for a gaussian
x_seq = seq(1,3,length.out = 100) 
df <- tibble(x_seq = x_seq, dens = dnorm(x_seq, m, s))


MK20 %>% 
  mutate(log_intensity = log10(X.Intensity)) %>% 
  ggplot(aes(log_intensity, Average/k)) +
  geom_point() +
  geom_line(data = df, aes(x_seq, dens)) 

enter image description here

Upvotes: 0

Otto K&#228;ssi
Otto K&#228;ssi

Reputation: 3083

A curve that touches each point will fit your data best for sure. :)

That aside, you could try to include a smoothed curve, e.g.

plot(log10(MK20$X.Intensity), MK20$Average, col=1, xlim=c(-0.5,4), ylim=c(0,20), 
     xlab="Log(Average diameter)", ylab="Intensity", xaxt='n', type='n')
lines(lowess(MK20$Average ~ log10(MK20$X.Intensity), f=0.3))

You can vary the f= parameter between (0 and 1) to change the level of smoothing.

Here's what the output looks like with f=0.3.

enter image description here

Upvotes: 0

Related Questions