INFO 523 - Exploratory Data Analysis + Data Visualization

	fage	mage	mature	weeks	premie	visits	gained	weight	lowbirthweight	sex	habit	marital	whitemom
0	34.0	34	younger mom	37	full term	14.0	28.0	6.96	not low	male	nonsmoker	married	white
1	36.0	31	younger mom	41	full term	12.0	41.0	8.86	not low	female	nonsmoker	married	white
2	37.0	36	mature mom	37	full term	10.0	28.0	7.51	not low	female	nonsmoker	married	not white
3	NaN	16	younger mom	38	full term	NaN	29.0	6.19	not low	male	nonsmoker	not married	white
4	32.0	31	younger mom	36	premie	12.0	48.0	6.75	not low	female	nonsmoker	married	white

	fage	mage	weeks	visits	gained	weight
count	886.000000	1000.000000	1000.000000	944.000000	958.000000	1000.000000
mean	31.133183	28.449000	38.666000	11.351695	30.425887	7.198160
std	7.058135	5.759737	2.564961	4.108192	15.242527	1.306775
min	15.000000	14.000000	21.000000	0.000000	0.000000	0.750000
25%	26.000000	24.000000	38.000000	9.000000	20.000000	6.545000
50%	31.000000	28.000000	39.000000	12.000000	30.000000	7.310000
75%	35.000000	33.000000	40.000000	14.000000	38.000000	8.000000
max	85.000000	47.000000	46.000000	30.000000	98.000000	10.620000

	mature	premie	lowbirthweight	sex	habit	marital	whitemom
count	1000	1000	1000	1000	981	1000	1000
unique	2	2	2	2	2	2	2
top	younger mom	full term	not low	male	nonsmoker	married	white
freq	841	876	919	505	867	594	765

	opid	operator	atype	remarks	phase_of_flt	ac_mass	num_engs	date	time_of_day	state	height	speed	effect	sky	species	birds_seen	birds_struck
0	AAL	AMERICAN AIRLINES	MD-80	NO DAMAGE	Descent	4.0	2.0	1990-09-30	Night	IL	7000.0	250.0	NaN	No Cloud	UNKNOWN BIRD - MEDIUM	NaN	1
1	USA	US AIRWAYS	FK-28-4000	2 BIRDS, NO DAMAGE.	Climb	4.0	2.0	1993-11-29	Day	MD	10.0	140.0	NaN	No Cloud	UNKNOWN BIRD - MEDIUM	10-Feb	10-Feb
2	AAL	AMERICAN AIRLINES	B-727-200	NaN	Approach	4.0	3.0	1993-08-13	Day	TN	400.0	140.0	NaN	Some Cloud	UNKNOWN BIRD - SMALL	10-Feb	1
3	AAL	AMERICAN AIRLINES	MD-82	NaN	Climb	4.0	2.0	1993-10-07	Day	VA	100.0	200.0	NaN	Overcast	UNKNOWN BIRD - SMALL	NaN	1
4	AAL	AMERICAN AIRLINES	MD-82	NO DAMAGE	Climb	4.0	2.0	1993-09-25	Day	SC	50.0	170.0	NaN	Some Cloud	UNKNOWN BIRD - SMALL	10-Feb	1

	ac_mass	num_engs	date	height	speed
count	18018.00	17995.00	19302	16109.00	12294.00
mean	3.36	2.10	1994-08-25 09:46:40.994715520	754.68	136.10
min	1.00	1.00	1990-01-08 00:00:00	0.00	0.00
25%	3.00	2.00	1992-08-18 00:00:00	0.00	110.00
50%	4.00	2.00	1994-10-01 00:00:00	40.00	130.00
75%	4.00	2.00	1996-09-13 18:00:00	500.00	150.00
max	5.00	4.00	1999-10-16 00:00:00	32500.00	400.00
std	1.01	0.57	NaN	1795.81	44.64

How to choose a plot

One Numeric Variable

Histogram

Frequency Distribution
Easy to Interpret
Identifies Patterns

Density Plot

Smooth Distribution Curve
Highlights Density
Comparative Analysis

Histograms

Code

# Histogram of the `speed` column of `birds`.
# (The extracted slide repeated this snippet three times with the copies
# fused onto the `plt.show()` line; one well-formed copy is kept.)
import seaborn as sns
import matplotlib.pyplot as plt

# displot draws a histogram by default (kind = "hist").
sns.displot(data = birds, x = "speed")

plt.show()

Histograms: bins

Code

# Histogram of `speed` with an explicit number of bins.
# `height` (inches) and `aspect` (width / height) size the figure.
sns.displot(data = birds, x = "speed",
            bins = 15, height = 5, aspect = 8/5)

plt.show()

Density Plot

Code

# Kernel density estimate of `speed` (kind = 'kde' replaces the histogram
# bars with a smooth curve).
sns.displot(data = birds, x = "speed",
            kind = 'kde', height = 5, aspect = 8/5)

plt.show()

Two Numeric Variables

Scatterplot

Relationship Visualization
Outlier Identification
Pattern Recognition

2D Density Plot

Density Distributions
Combine Contour and Color
Complex Data Interpretation

Scatterplots

Code

# Scatterplot of `height` against `speed`.
sns.set_theme(style = "whitegrid")

sns.scatterplot(data = birds, x = "speed", y = "height")

plt.show()

Scatterplots - color

Code

# Scatterplot of `height` against `speed`, with points coloured by the
# categorical `sky` column.
sns.set_theme(style = "whitegrid")

sns.scatterplot(data = birds, x = "speed", y = "height",
                hue = "sky")

plt.show()

Scatterplots - size + color

Code

# Scatterplot of `height` against `speed` that encodes two further
# columns: marker size by `num_engs` and marker colour by `ac_mass`.
sns.set_theme(style = "whitegrid")

sns.scatterplot(data = birds, x = "speed", y = "height",
                size = "num_engs", hue = "ac_mass")

plt.show()

Scatterplots - linear relationships

Code

# Scatterplot of `height` against `speed` with a fitted linear regression
# line (lmplot is figure-level, hence `aspect` rather than a figsize).
sns.lmplot(data = birds, x = "speed", y = "height",
           aspect = 8/5)

plt.show()

Scatterplots - grouped relationships

Code

# One regression line per level of `num_engs`, drawn on shared axes.
sns.lmplot(data = birds, x = "speed", y = "height",
           hue = "num_engs", aspect = 8/5)

plt.show()

2D Density Plots

Code

# Bivariate (2D) histogram of `speed` and `height`.
# pthresh = 0.1 leaves the lowest-count cells transparent so the dense
# region stands out against the background.
sns.histplot(data = birds, x = "speed", y = "height",
             bins = 50, pthresh = 0.1, cmap = "mako")

plt.show()

2D Density plots: contours

Code

# Bivariate KDE contours of `speed` against log-transformed `height`,
# one set of contours per `sky` level.
# log1p(h) == log(h + 1): it keeps height == 0 finite and is the
# numerically accurate spelling of the same transform.
sns.kdeplot(data = birds, x = "speed", y = np.log1p(birds['height']),
            thresh = 0.1, hue = "sky", palette = "colorblind")

plt.show()

2D Density plots: filled contours

Code

# Filled bivariate KDE of `speed` against log-transformed `height`.
# thresh = 0 fills the whole axes (no density level is cut off) and
# levels = 10 sets the number of contour levels.
# log1p(h) == log(h + 1), so height == 0 stays finite.
sns.kdeplot(data = birds, x = "speed", y = np.log1p(birds['height']),
            thresh = 0, cmap = "mako", fill = True, levels = 10)

plt.show()

Two Ordered Numeric Variables

Line Plot

Trend Identification
Simple and Clear
Comparative Analysis

Area Plot

Cumulative Representation
Emphasizes Volume
Layered Comparisons

Line Plot

Code

# Line plot of `speed` over `date`.
sns.set_theme(style = "whitegrid")

sns.lineplot(data = birds, x = "date", y = "speed")

plt.show()

Line Plot: grouped lines

Code

# Line plot of `speed` over `date`, one line per `sky` level.
sns.lineplot(data = birds, x = "date", y = "speed",
             hue = "sky")

plt.show()

One Categorical

Barplot

Categorical Comparison
Clear Visualization
Versatile Use

Pie Chart

Proportional Representation
Simple Interpretation
Visual Appeal

Barplot

Code

# Bar chart of the number of rows in each `sky` category.
sns.set_theme(style = "white")

# Passing `palette` without `hue` is deprecated in seaborn >= 0.13, so the
# x variable is also mapped to hue (as the boxplots in this deck already
# do); legend = False hides the legend, which would only repeat the axis.
sns.countplot(data = birds, x = "sky", hue = "sky",
              palette = "colorblind", legend = False)

plt.show()

Pie Chart

Can’t use {seaborn}: it has no pie chart function, so we call matplotlib’s plt.pie() directly

Code

# Pie chart of the share of rows in each `sky` category, drawn with
# matplotlib directly.
category_counts = birds['sky'].value_counts()

# autopct receives each wedge's percentage and returns its label text.
plt.pie(category_counts, labels = category_counts.index,
        autopct = lambda p: f'{p:.1f}%',
        textprops = {'size':14})

# Equal axis scaling keeps the pie circular.
plt.axis('equal')

plt.show()

One Numerical + One Categorical

Boxplot

Displays Quartiles
Identifies Outliers
Comparative Analysis

Violin chart

Density Representation
Richer Data Insight
Visualizes Data Spread

Boxplots

Code

# Horizontal boxplots of `speed`, one box per `sky` category.
sns.set_style("whitegrid")

# `sky` is mapped to hue as well as y so that `palette` colours each box.
sns.boxplot(data = birds, x = "speed", y = "sky",
            hue = "sky", palette = "colorblind")

plt.show()

Trim axes

Code

# Same boxplot as before, with the x axis limited to 0-250.
sns.set_style("whitegrid")

# sns.boxplot returns the matplotlib Axes, which is kept for set_xlim.
p1 = sns.boxplot(data = birds, x = "speed", y = "sky",
                 hue = "sky", palette = "colorblind")

# Only the view is restricted; the boxes are still computed from all rows.
p1.set_xlim(0, 250)

plt.show()

Violin Plots

Code

# Violin plots of `speed`, one violin per `sky` category.
sns.set_style("white")

sns.violinplot(data = birds, x = "speed", y = "sky", hue = "sky",
               palette = "colorblind")

plt.show()

Violin Plots: paired

Code

# Paired violins: for each `sky` category, one violin per time of day.
sns.set_style("white")

# Keep only rows whose time_of_day is 'Night' or 'Day' so that each sky
# category shows exactly two violins.
options = ['Night', 'Day']

birds_filt = birds[birds['time_of_day'].isin(options)]

sns.violinplot(data = birds_filt, x = "speed", y = "sky", hue = "time_of_day",
               palette = "colorblind")

plt.show()

Violin Plots: quartiles + split

Code

# Split violins: the two time_of_day levels share one violin per `sky`
# category (split = True), with quartile lines inside (inner = "quart").
# Uses `birds_filt`, the Night/Day subset of `birds` built in the
# previous snippet.
sns.set_style("white")

sns.violinplot(data = birds_filt, x = "speed", y = "sky", hue = "time_of_day",
               palette = "colorblind",
               inner = "quart", split = True)

plt.show()

Cleaning up our plots

My minimum expectation:

Code

# Paired violin plot with readable axis label, title and legend title.
# Uses `birds_filt`, the Night/Day subset of `birds` built earlier.
sns.set_style("white")

# violinplot returns the matplotlib Axes, used below for the labels.
g1 = sns.violinplot(data = birds_filt, x = "speed", y = "sky", hue = "time_of_day",
                    palette = "colorblind")

# ylabel = None removes the y axis label (the category names suffice).
g1.set(xlabel = "Speed (mph)",
       ylabel = None,
       title = "Speed of plane collisions with birds")
g1.legend(title = "Time of day")

plt.show()

Aside: Correlations

Code

# Lower-triangle heatmap of pairwise correlations between the numeric
# columns of `birds`.
sns.set_theme(style = "white")

birds_num = birds.select_dtypes(include = 'number')

corr = birds_num.corr()

# True on and above the diagonal: those cells are hidden because the
# matrix is symmetric and the diagonal is always 1.
mask = np.triu(np.ones_like(corr, dtype = bool))

fig, ax = plt.subplots(figsize = (8, 6))

# Diverging palette centred on 0 so the sign of a correlation is visible.
cmap = sns.diverging_palette(230, 20, as_cmap = True)

# Draw on the axes created above instead of relying on the implicit
# "current axes".
sns.heatmap(corr, mask = mask, cmap = cmap, vmax = 0.5, center = 0,
            square = True, linewidths = .5, cbar_kws = {"shrink": 0.5},
            ax = ax)

plt.show()

Lastly: Pairgrids

Code

# Pair grid over three numeric columns of `birds`, coloured by `sky`.
sns.set_theme(style = "white")

# `sky` is kept in the subset only so it can be used as the hue variable.
birds_sub = birds[['ac_mass', 'height', 'speed', 'sky']]

g = sns.PairGrid(birds_sub, diag_sharey = False,
                 height = 2, hue = "sky")

# Upper triangle: scatterplots; lower triangle: bivariate KDE contours;
# diagonal: univariate KDE curves.
g.map_upper(sns.scatterplot, s = 15)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot, lw = 2)

plt.show()

Diwali sales data: metadata

variable	class	description
User_ID	double	User identification number
Cust_name	character	Customer name
Product_ID	character	Product identification number
Gender	character	Gender of the customer (e.g. Male, Female)
Age Group	character	Age group of the customer
Age	double	Age of the customer
Marital_Status	double	Marital status of customer (Married, Single)
State	character	State of the customer
Zone	character	Geographic zone of the customer
Occupation	character	Occupation of the customer
Product_Category	character	Category of the product
Orders	double	Number of orders made by the customer
Amount	double	Amount in Indian rupees spent by the customer

# Load the Diwali sales data (TidyTuesday, 2023-11-14) and preview it.
diwali_url = (
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/'
    'master/data/2023/2023-11-14/diwali_sales_data.csv'
)

# The file is read as ISO-8859-1 rather than pandas' default UTF-8.
diwali = pd.read_csv(diwali_url, encoding = 'iso-8859-1')
diwali.head()

	User_ID	Cust_name	Product_ID	Gender	Age Group	Age	Marital_Status	State	Zone	Occupation	Product_Category	Orders	Amount
0	1002903	Sanskriti	P00125942	F	26-35	28	0	Maharashtra	Western	Healthcare	Auto	1	23952.0
1	1000732	Kartik	P00110942	F	26-35	35	1	Andhra Pradesh	Southern	Govt	Auto	3	23934.0
2	1001990	Bindu	P00118542	F	26-35	35	1	Uttar Pradesh	Central	Automobile	Auto	3	23924.0
3	1001425	Sudevi	P00237842	M	0-17	16	0	Karnataka	Southern	Construction	Auto	2	23912.0
4	1000588	Joni	P00057942	M	26-35	28	1	Gujarat	Western	Food Processing	Auto	2	23877.0

Code

# Examine data: column names, non-null counts and memory usage
diwali.info()

# Data types
diwali.dtypes

# Describe numerical columns
diwali.describe()

# Describe categories (every non-numeric column)
diwali.describe(exclude = [np.number])

# Unique levels of each categorical column, keyed by column name.
# (The original indexed `diwali[col]` with an undefined `col`, which
# raises NameError; every categorical column is now covered.)
categorical_cols = diwali.select_dtypes(include = ['object', 'category']).columns
unique_levels = {col: diwali[col].unique() for col in categorical_cols}

# Outliers: flag values more than 1.5 * IQR outside the quartiles
# Work on a copy of the diwali data
dataCopy = diwali.copy()

# Keep the numerical columns only
dataRed = dataCopy.select_dtypes(include = np.number)

# Names of the numerical columns
dataRedColsList = dataRed.columns

# Report quartiles, cut-offs and the outlier count for every column
for i_col in dataRedColsList:
  # Values of the current column
  dataRed_i = dataRed[i_col]

  # 25th and 75th percentiles, rounded to 3 decimals
  q25 = round(dataRed_i.quantile(q = 0.25), 3)
  q75 = round(dataRed_i.quantile(q = 0.75), 3)

  # Interquartile range derived from the rounded percentiles
  IQR = round(q75 - q25, 3)

  # Distance beyond the quartiles at which a value counts as an outlier
  cut_off = IQR * 1.5

  # Lower and upper cut-offs
  lower = round(q25 - cut_off, 3)
  upper = round(q75 + cut_off, 3)

  # Blank separator line between columns
  print(' ')

  # Percentiles and IQR of the current column
  print(i_col, 'q25 =', q25, 'q75 =', q75, 'IQR =', IQR)

  # Lower and upper cut-offs
  print('lower, upper:', lower, upper)

  # Number of values strictly outside the (lower, upper) limits
  is_outlier = (dataRed_i < lower) | (dataRed_i > upper)
  print('Number of Outliers: ', dataRed_i[is_outlier].count())

# Missing values: number of NaNs per column
diwali.isnull().sum()

# Normality - density and Q-Q plot of Amount
# Change theme to "white"
sns.set_style("white")

# Make a copy of the data
dataCopy = diwali.copy()

# Remove only the rows where Amount itself is missing. A bare dropna()
# would also discard rows that are missing values in unrelated columns
# and shrink the sample behind both plots.
dataCopyFin = dataCopy.dropna(subset = ['Amount'])

# Specify desired column
i_col = dataCopyFin.Amount

# Two side-by-side panels: density on the left, Q-Q plot on the right
fig, (ax1, ax2) = plt.subplots(ncols = 2, nrows = 1)

# Density plot
sns.kdeplot(i_col, linewidth = 5, ax = ax1)
ax1.set_title('Amount spent (₹)')

# Q-Q plot; line = 's' adds a reference line scaled by the sample's
# standard deviation and shifted by its mean
sm.qqplot(i_col, line = 's', ax = ax2)
ax2.set_title('Amount spent Q-Q plot')
plt.tight_layout()
plt.show()