Reputation: 647
In order to create a dataset to test a statistical calculation package, I want to be able to generate a sample that is correlated to a reference sample with a given searman coefficient.
I managed to do it for the Pearson coefficient, but the fact that Spearman works on the rank makes it quite tricky, to say the least!
As an example, with a code to generate spearman and pearson correlated samples:
from statistics import correlation
import numpy as np
from scipy import stats
def generate_pearson_correlated_to_sample(x, correlation):
"""
Generate a variable with a specified Pearson correlation coefficient to a given sample.
Parameters
----------
x : array-like
The fixed first sample.
correlation : float
Desired Pearson correlation coefficient (-1 <= correlation <= 1).
Returns
-------
array-like
The generated sample y with specified Pearson correlation to x.
"""
# standardize the pixel data
x_std = (x - x.mean()) / x.std()
# generate independent standard normal data
z = np.random.normal(loc=0, scale=1, size=x_std.shape)
# Create correlated variable (standardized)
y_std = correlation * x_std + np.sqrt(1 - correlation ** 2) * z
# Scale to desired standard deviation and add mean
mean = x.mean()
std = x.std()
y = y_std * std + mean
return y
def generate_spearman_correlated_to_sample(x, correlation):
"""
Generate a variable with a specified Spearman correlation coefficient to a given sample.
Parameters
----------
x : array-like
The fixed first sample.
correlation : float
Desired Spearman correlation coefficient (-1 <= correlation <= 1).
Returns
-------
array-like
The generated sample y with specified Spearman correlation to x.
"""
n_samples = len(x)
# Convert x to ranks (normalized between 0 and 1)
x_ranks = stats.rankdata(x) / (n_samples + 1)
# Convert ranks to normal distribution
normal_x = stats.norm.ppf(x_ranks)
# Generate correlated normal variable
normal_y = correlation * normal_x + np.sqrt(1 - correlation ** 2) * np.random.normal(0, 1, n_samples)
# Convert back to uniform distribution
y_uniform = stats.norm.cdf(normal_y)
# Convert uniform to same marginal distribution as x using empirical CDF
x_sorted = np.sort(x)
y = np.interp(y_uniform, np.linspace(0, 1, n_samples), x_sorted)
return y
def verify_correlations(x, y):
"""
Calculate both Spearman and Pearson correlations between two variables.
Parameters:
-----------
x, y : array-like
The two variables to check
Returns:
--------
tuple
(spearman_correlation, pearson_correlation)
"""
spearman_corr = stats.spearmanr(x, y)[0]
pearson_corr = stats.pearsonr(x, y)[0]
return spearman_corr, pearson_corr
# Example usage
if __name__ == "__main__":
# Set random seed for reproducibility
np.random.seed(42)
# Create different types of example data
x_normal = np.random.normal(0, 1, 10000) # Normal distribution
x_exp = np.random.exponential(2, 10000) # Exponential distribution
x_bimodal = np.concatenate([np.random.normal(-2, 0.5, 5000),
np.random.normal(2, 0.5, 5000)]) # Bimodal distribution
# Test with different distributions and correlations
test_cases = [
(x_normal, 0.7, "Normal Distribution"),
(x_exp, 0.5, "Exponential Distribution"),
(x_bimodal, 0.8, "Bimodal Distribution")
]
# Run examples
for x, target_corr, title in test_cases:
print(f"\nTesting with {title}")
print(f"Target correlation: {target_corr}")
# Generate correlated sample
y_spearman = generate_spearman_correlated_to_sample(x, correlation=target_corr)
y_pearson = generate_pearson_correlated_to_sample(x, correlation=target_corr)
# Calculate actual correlations
spearman_corr, _= verify_correlations(x, y_spearman)
_, pearson_corr = verify_correlations(x, y_pearson)
print(f"Achieved Spearman correlation: {spearman_corr:.4f}")
print(f"Achieved Pearson correlation: {pearson_corr:.4f}")
With the above code, the generated Pearson coefficient is of course not exactly equal to the targeted value due to the random nature of the code. But I find that Spearman is systematically off by a much larger amount, which makes me suspecting a problem in my code.
I work in Python but any help is appreciated!
Upvotes: 1
Views: 49
Reputation: 647
I finally managed to improve the above code for spearman correlation:
def generate_spearman_correlated_to_sample_bis(x, correlation):
"""
Generate a variable with a specified Spearman correlation coefficient to a given sample.
Parameters
----------
x : array-like
The fixed first sample.
correlation : float
Desired Spearman correlation coefficient (-1 <= correlation <= 1).
Returns
-------
array-like
The generated sample y with specified Spearman correlation to x.
"""
n_samples = len(x)
# Convert x to ranks
x_ranks = stats.rankdata(x)# / (n_samples + 1)
x_std = (x_ranks-x_ranks.mean()) / x_ranks.std()
# generate independent standard normal data
z = np.random.normal(loc=0, scale=1, size=x_std.shape)
# Create correlated variable (standardized)
y_std = correlation * x_std + np.sqrt(1 - correlation ** 2) * z
# Rescale to obtain ranks (values between 1 and n_samples)
y_ranks=1+ (n_samples-1)*(y_std-min(y_std))/(max(y_std)-min(y_std))
# resample to respect the ranks
x_sorted = np.sort(x)
y = np.interp(y_ranks, np.arange(1, n_samples+1), x_sorted)
return y
I removed the part where I project the ranks as a normal distribution. It is much more stable and give results much more acurate.
Upvotes: 0
Reputation: 15283
One simple approach is to toss it into least-squares as a polishing method, with your initial approach as an estimate:
def generate_pearson_correlated_to_sample(
x: np.ndarray,
correlation: float,
) -> np.ndarray:
xerror = x - x.mean()
lhs = correlation*np.sqrt(xerror.dot(xerror))
# standardize the pixel data
mean = x.mean()
std = x.std()
x_std = (x - mean)/std
# generate independent standard normal data
z = np.random.normal(loc=0, scale=1, size=x_std.shape)
# Create correlated variable (standardized)
y_std = correlation*x_std + np.sqrt(1 - correlation**2)*z
# Scale to desired standard deviation and add mean
y0 = y_std*std + mean
def error_fun(y: np.ndarray) -> float:
yerror = y - y.mean()
corr_error = xerror.dot(yerror) - lhs*np.sqrt(yerror.dot(yerror))
return corr_error
result = least_squares(
fun=error_fun, x0=y0,
method='dogbox', jac='2-point', x_scale='jac', loss='linear')
assert result.success
return result.x
Shown only with Pearson; Spearman will be the same:
Testing with Normal Distribution
Target correlation: 0.7
Achieved Pearson correlation: 0.7000
Testing with Exponential Distribution
Target correlation: 0.5
Achieved Pearson correlation: 0.5000
Testing with Bimodal Distribution
Target correlation: 0.8
Achieved Pearson correlation: 0.8000
Upvotes: 0