Reputation: 3
I have this dataset from Kaggle: House Prices: Advanced Regression Techniques. I loaded it as follows:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as st
# Read the training CSV into a DataFrame; the relative path means train.csv must be in the working directory.
train = pd.read_csv("train.csv")
I then attempted to plot SalePrice by YearBuilt using OverallQual as the hue:
# Asker's reproduction of the problem: swarm plot of SalePrice vs. YearBuilt, colored by OverallQual.
# NOTE(review): sns.factorplot presumably creates its own figure, so this plt.figure call may just leave an empty figure - confirm.
plt.figure(figsize = [8, 6])
sns.factorplot(
x = 'YearBuilt',
y = 'SalePrice',
data = train,
kind='swarm',
hue='OverallQual',
palette = 'Set2')
plt.title('House Sale Prices (log scale) vs. Year Built and Overall Quality')
# NOTE(review): tick positions are given as year values (1880..2000); if the swarm plot places each
# distinct year at an integer category position, these fall far outside the data - check plt.xticks().
plt.xticks([1880, 1900, 1920, 1940, 1960, 1980, 2000], ['1880', '1900', '1920', '1940', '1960', '1980', '2000'])
#plt.xlim([1872,2009])
plt.xlabel('Year Built')
plt.yscale('log')
#plt.ylim([4e4, 6e5])
# NOTE(review): eight tick positions but only seven labels - there is no label for 4e5.
plt.yticks([6e4, 1e5, 1.4e5, 1.8e5, 2.4e5, 3.2e5, 4e5, 6e5], ['60k', '100k', '140K', '180k', '240K', '320k', '600k'])
plt.ylabel('Sale Price ($)')
plt.show()
Which results in the following graph:
If I include the xlim or ylim the graph doesn't show up at all.
I'm new to this so any help is greatly appreciated.
Upvotes: 0
Views: 144
Reputation: 62513
# Solution 1: draw the swarm plot with catplot and thin out the x tick labels
# instead of trying to reposition the ticks.
plt.style.use('ggplot')
g = sns.catplot(
    x='YearBuilt',
    y='SalePrice',
    data=train,
    kind='swarm',
    hue='OverallQual',
    palette='Set2',
    height=6,
    aspect=2,
)

locs, labels = plt.xticks()  # use if needed, as explained below.

# There is one tick label per distinct year, which is unreadable; show only
# every 8th label and hide the rest.
for axes in g.axes.flat:
    for ind, label in enumerate(axes.get_xticklabels()):
        label.set_visible(ind % 8 == 0)

plt.title('House Sale Prices (log scale) vs. Year Built and Overall Quality')
plt.xlabel('Year Built')
plt.yscale('log')
plt.ylim([4e4, 6e5])
# Tick positions and labels must pair up one-to-one: the original list had
# eight positions but only seven labels (the '400k' label for 4e5 was missing).
plt.yticks(
    [6e4, 1e5, 1.4e5, 1.8e5, 2.4e5, 3.2e5, 4e5, 6e5],
    ['60k', '100k', '140K', '180k', '240K', '320k', '400k', '600k'],
)
plt.ylabel('Sale Price ($)')
plt.show()
`sns.factorplot` is deprecated in favor of `sns.catplot`.
`plt.xticks([1880, 1900, 1920, 1940, 1960, 1980, 2000], ['1880', '1900', '1920', '1940', '1960', '1980', '2000'])` attempts to replace the 112 existing xticks and xticklabels.
With `locs, labels = plt.xticks()`, `locs` is a list from 0 to 111, while the locations you tried to set are 1880 to 2000, which is why all the data ends up on the left side. Essentially, the x-axis range is now from 0 to 2000. `YearBuilt` is an `int`, not a `datetime`.
`locs, labels = plt.xticks()`: `labels` is a generator function and can be unpacked with `labels = [*labels]`. `type(labels[0])` is `matplotlib.text.Text`, so to get a list of only the label strings, use `label_t = [x.get_text() for x in labels]`.
# Translate the year labels we want to show into their positions on the
# x-axis by looking each one up in the list of existing tick-label strings.
years = ['1880', '1900', '1920', '1940', '1960', '1980', '2000']
xticks = [label_t.index(year) for year in years]
print(xticks)  # [2, 9, 24, 43, 61, 81, 101]
# Solution 2: draw the swarm plot with catplot and place the seven year labels
# at the tick positions computed in `xticks`.
plt.style.use('ggplot')
g = sns.catplot(
    x='YearBuilt',
    y='SalePrice',
    data=train,
    kind='swarm',
    hue='OverallQual',
    palette='Set2',
    height=6,
    aspect=2,
)

plt.title('House Sale Prices (log scale) vs. Year Built and Overall Quality')
# `xticks` holds positions on the x-axis (e.g. 2, 9, 24, ...), not year values.
plt.xticks(xticks, ['1880', '1900', '1920', '1940', '1960', '1980', '2000'])
# plt.xlim([1872,2009])
plt.xlabel('Year Built')
plt.yscale('log')
# plt.ylim([4e4, 6e5])
# Tick positions and labels must pair up one-to-one: the original list had
# eight positions but only seven labels (the '400k' label for 4e5 was missing).
plt.yticks(
    [6e4, 1e5, 1.4e5, 1.8e5, 2.4e5, 3.2e5, 4e5, 6e5],
    ['60k', '100k', '140K', '180k', '240K', '320k', '400k', '600k'],
)
plt.ylabel('Sale Price ($)')
plt.show()
# Sorted distinct build years; an entry's index in this array lines up with the tick positions printed above (e.g. 1880 is at index 2).
np.sort(train.YearBuilt.unique())
array([1872, 1875, 1880, 1882, 1885, 1890, 1892, 1893, 1898, 1900, 1904,
1905, 1906, 1908, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917,
1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928,
1929, 1930, 1931, 1932, 1934, 1935, 1936, 1937, 1938, 1939, 1940,
1941, 1942, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953,
1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964,
1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975,
1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997,
1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
2009, 2010], dtype=int64)
Upvotes: 2