Project 01 peer review is Feb 07; first-round proposals are due before class.
Setup
# Import all required libraries
import random

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import gamma, kurtosis, norm, skewnorm

# Increase font size of all Seaborn plot elements
sns.set(font_scale=1.25)

# Load in the births14 data
births14 = pd.read_csv("data/births14.csv")

# Set seed
# NOTE: this seeds only Python's built-in `random` module; NumPy/SciPy
# random draws use a separate generator and are not affected by this call.
random.seed(123)
Exploratory Data Analysis
What is exploratory data analysis?
Exploratory data analysis is a statistical approach to analyzing data sets in order to investigate and summarize their main characteristics, often through statistical graphics and other data visualization methods.
# Example with the premie column: describe every numeric variable
# separately for each level of premie
described = births14.groupby('premie').describe()

# Reshape into one long Series (variable, statistic, premie level)
groups = described.unstack(1)

# Print all rows
print(groups.to_string())
premie
fage count full term 775.000000
premie 111.000000
mean full term 30.967742
premie 32.288288
std full term 6.681591
premie 9.226826
min full term 15.000000
premie 15.000000
25% full term 26.000000
premie 27.000000
50% full term 31.000000
premie 32.000000
75% full term 35.000000
premie 36.000000
max full term 49.000000
premie 85.000000
mage count full term 876.000000
premie 124.000000
mean full term 28.329909
premie 29.290323
std full term 5.721104
premie 5.982052
min full term 14.000000
premie 16.000000
25% full term 24.000000
premie 24.000000
50% full term 28.000000
premie 30.000000
75% full term 33.000000
premie 34.000000
max full term 44.000000
premie 47.000000
weeks count full term 876.000000
premie 124.000000
mean full term 39.376712
premie 33.645161
std full term 1.469571
premie 3.009993
min full term 37.000000
premie 21.000000
25% full term 38.000000
premie 33.000000
50% full term 39.000000
premie 35.000000
75% full term 40.000000
premie 36.000000
max full term 46.000000
premie 36.000000
visits count full term 829.000000
premie 115.000000
mean full term 11.516285
premie 10.165217
std full term 3.884353
premie 5.329380
min full term 0.000000
premie 0.000000
25% full term 10.000000
premie 7.000000
50% full term 12.000000
premie 10.000000
75% full term 14.000000
premie 12.000000
max full term 30.000000
premie 30.000000
gained count full term 839.000000
premie 119.000000
mean full term 30.410012
premie 30.537815
std full term 15.021661
premie 16.785683
min full term 0.000000
premie 0.000000
25% full term 20.000000
premie 20.000000
50% full term 30.000000
premie 29.000000
75% full term 38.000000
premie 41.000000
max full term 98.000000
premie 85.000000
weight count full term 876.000000
premie 124.000000
mean full term 7.434178
premie 5.530806
std full term 1.021699
premie 1.801182
min full term 3.930000
premie 0.750000
25% full term 6.770000
premie 4.500000
50% full term 7.440000
premie 5.750000
75% full term 8.082500
premie 6.572500
max full term 10.620000
premie 9.250000
Outliers
Outliers: values more than 1.5 × the interquartile range (IQR) below the 25th percentile or above the 75th percentile
Assess outliers visually
# Change theme to "white"
sns.set_style("white")

# Boxplot of a single numerical variable (weight); points drawn beyond the
# whiskers are the candidates for outliers
sns.boxplot(data=births14, x='weight', width=0.20)
plt.show()
fage q25 = 26.0 q75 = 35.0 IQR = 9.0
lower, upper: 12.5 48.5
Number of Outliers: 7
mage q25 = 24.0 q75 = 33.0 IQR = 9.0
lower, upper: 10.5 46.5
Number of Outliers: 1
weeks q25 = 38.0 q75 = 40.0 IQR = 2.0
lower, upper: 35.0 43.0
Number of Outliers: 72
visits q25 = 9.0 q75 = 14.0 IQR = 5.0
lower, upper: 1.5 21.5
Number of Outliers: 30
gained q25 = 20.0 q75 = 38.0 IQR = 18.0
lower, upper: -7.0 65.0
Number of Outliers: 26
weight q25 = 6.545 q75 = 8.0 IQR = 1.455
lower, upper: 4.362 10.183
Number of Outliers: 32
# Work on a copy of the births14 data
dataCopy = births14.copy()

# Keep only the numerical columns
dataRed = dataCopy.select_dtypes(include=np.number)

# Names of the numerical columns
dataRedColsList = dataRed.columns

# Report the IQR-based outlier limits and outlier count for each column
for i_col in dataRedColsList:
    # Values of the current column
    dataRed_i = dataRed[i_col]

    # 25th and 75th percentiles, rounded to 3 decimals
    q25 = round(dataRed_i.quantile(q=0.25), 3)
    q75 = round(dataRed_i.quantile(q=0.75), 3)

    # Interquartile range from the rounded percentiles
    IQR = round(q75 - q25, 3)

    # Distance beyond the quartiles at which a value counts as an outlier
    cut_off = IQR * 1.5

    # Lower and upper cut-offs
    lower = round(q25 - cut_off, 3)
    upper = round(q75 + cut_off, 3)

    # Blank separator line, then the summary for this column
    print(' ')
    print(i_col, 'q25 =', q25, 'q75 =', q75, 'IQR =', IQR)
    print('lower, upper:', lower, upper)

    # Comparisons with NaN are False, so missing values are never counted
    too_low = dataRed_i < lower
    too_high = dataRed_i > upper
    print('Number of Outliers: ', (too_low | too_high).sum())
# Select numerical columns
numerical_cols = births14.select_dtypes(include=['number']).columns

# Start with every row kept; each column can only switch rows off. Building
# one cumulative mask (instead of re-filtering births14 inside the loop) means
# the result reflects ALL numerical columns, not just the last one visited.
keep_rows = pd.Series(True, index=births14.index)

for col in numerical_cols:
    # Find Q1, Q3, and interquartile range (IQR) for each column
    Q1 = births14[col].quantile(0.25)
    Q3 = births14[col].quantile(0.75)
    IQR = Q3 - Q1

    # Upper and lower bounds for each column
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # A value is acceptable when it lies inside [lower_bound, upper_bound].
    # Missing values are not outliers, so they are kept here and left for
    # the missing-value step to deal with.
    in_bounds = births14[col].between(lower_bound, upper_bound)
    keep_rows = keep_rows & (in_bounds | births14[col].isna())

# Filter out the outliers from the DataFrame
births14_clean = births14[keep_rows]
Why are there still outliers?
Missing values (NaN)
# Number of missing values (NaN) in each column; the counts can differ
# from column to column
births14.isnull().sum()
Analysis for mature:
Unique Levels: ['younger mom' 'mature mom']
Counts:
mature
younger mom 841
mature mom 159
Name: count, dtype: int64
Proportions:
mature
younger mom 0.841
mature mom 0.159
Name: proportion, dtype: float64
--------------------------------------------------
Analysis for premie:
Unique Levels: ['full term' 'premie']
Counts:
premie
full term 876
premie 124
Name: count, dtype: int64
Proportions:
premie
full term 0.876
premie 0.124
Name: proportion, dtype: float64
--------------------------------------------------
Analysis for lowbirthweight:
Unique Levels: ['not low' 'low']
Counts:
lowbirthweight
not low 919
low 81
Name: count, dtype: int64
Proportions:
lowbirthweight
not low 0.919
low 0.081
Name: proportion, dtype: float64
--------------------------------------------------
Analysis for sex:
Unique Levels: ['male' 'female']
Counts:
sex
male 505
female 495
Name: count, dtype: int64
Proportions:
sex
male 0.505
female 0.495
Name: proportion, dtype: float64
--------------------------------------------------
Analysis for habit:
Unique Levels: ['nonsmoker' 'smoker' nan]
Counts:
habit
nonsmoker 867
smoker 114
Name: count, dtype: int64
Proportions:
habit
nonsmoker 0.883792
smoker 0.116208
Name: proportion, dtype: float64
--------------------------------------------------
Analysis for marital:
Unique Levels: ['married' 'not married']
Counts:
marital
married 594
not married 406
Name: count, dtype: int64
Proportions:
marital
married 0.594
not married 0.406
Name: proportion, dtype: float64
--------------------------------------------------
Analysis for whitemom:
Unique Levels: ['white' 'not white']
Counts:
whitemom
white 765
not white 235
Name: count, dtype: int64
Proportions:
whitemom
white 0.765
not white 0.235
Name: proportion, dtype: float64
--------------------------------------------------
# Select categorical (object / category dtype) columns
categorical_cols = births14.select_dtypes(include=['object', 'category']).columns

# Per-column results, keyed by column name
category_analysis = {}

# Collect the summaries for each categorical column
for col in categorical_cols:
    column = births14[col]

    # value_counts() leaves NaN out by default, whereas unique() reports it
    counts = column.value_counts()
    proportions = column.value_counts(normalize=True)
    unique_levels = column.unique()

    category_analysis[col] = {
        'Unique Levels': unique_levels,
        'Counts': counts,
        'Proportions': proportions,
    }

# Print the collected summaries, one section per column
for col, data in category_analysis.items():
    print(f"Analysis for {col}:\n")
    print("Unique Levels:", data['Unique Levels'])
    print("\nCounts:\n", data['Counts'])
    print("\nProportions:\n", data['Proportions'])
    print("\n"+"-"*50+"\n")
Conditions of normality
Histogram: bell-shaped curve
Skewness: Close to 0 for symmetry; Kurtosis: Close to 3 for normal “tailedness.”
Sample Size: Larger samples are less sensitive to non-normality.
Empirical Rule: 68-95-99.7% rule (share of the data within 1, 2, and 3 standard deviations of the mean).
# Change theme to "white"
sns.set_style("white")

# Copy the data and drop every row that has a missing value in any column
dataCopy = births14.copy()
dataCopyFin = dataCopy.dropna()

# Column to inspect
i_col = dataCopyFin["weight"]

# One row, two panels: density on the left, Q-Q plot on the right
fig, axes = plt.subplots(nrows=1, ncols=2)
ax1, ax2 = axes

# Density plot
sns.kdeplot(i_col, linewidth=5, ax=ax1)
ax1.set_title('Newborn Weight Density plot')

# Q-Q plot with a standardized reference line
sm.qqplot(i_col, line='s', ax=ax2)
ax2.set_title('Newborn Weight Q-Q plot')

plt.tight_layout()
plt.show()
Negative skew (left-tailed)
Code
# Change theme to "white"
sns.set_style("white")

# Make a copy of the data and remove rows with missing values here, so this
# cell does not depend on a `dataCopyFin` left over from an earlier cell
dataCopy = births14.copy()
dataCopyFin = dataCopy.dropna()

# Select only numerical columns
dataRed = dataCopyFin.select_dtypes(include=np.number)

# One figure per numerical column
for k in dataRed.columns:
    # Create a figure with two subplots
    fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1)

    # Density plot
    sns.kdeplot(dataRed[k], linewidth=5, ax=ax1)
    ax1.set_title(f'{k} Density Plot')

    # Q-Q plot
    sm.qqplot(dataRed[k], line='s', ax=ax2)
    ax2.set_title(f'{k} QQ Plot')

    plt.tight_layout()
    plt.show()
Conclusions
Inspect all data immediately
Assess outliers and missing values
Assess normality
Make corrections as needed (more next time)
Exploratory plotting
Data visualization
The practice of designing and creating easy-to-communicate and easy-to-understand graphic or visual representations of a large amount of complex quantitative and qualitative data and information with the help of static, dynamic or interactive visual items.
My definition: telling a story with your data, visually.
Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics.
# Grid theme so the speed values are easier to read off the axis
sns.set_style("whitegrid")

# Speed by sky condition, one colour per sky level
sns.boxplot(
    data=birds,
    x="speed",
    y="sky",
    hue="sky",
    palette="colorblind",
)
plt.show()
Trim axes
Code
sns.set_style("whitegrid")

# Same boxplot as before, keeping the Axes so its limits can be adjusted
p1 = sns.boxplot(
    data=birds,
    x="speed",
    y="sky",
    hue="sky",
    palette="colorblind",
)

# Trim the x-axis to the 0-250 range
p1.set(xlim=(0, 250))
plt.show()
Violin Plots
Code
sns.set_style("white")

# Violin plot of speed for each sky condition
sns.violinplot(
    data=birds,
    x="speed",
    y="sky",
    hue="sky",
    palette="colorblind",
)
plt.show()
Violin Plots: paired
Code
sns.set_style("white")

# Keep only the rows recorded at night or during the day
options = ['Night', 'Day']
is_selected = birds['time_of_day'].isin(options)
birds_filt = birds[is_selected]

# Paired violins: one per time of day within each sky condition
sns.violinplot(
    data=birds_filt,
    x="speed",
    y="sky",
    hue="time_of_day",
    palette="colorblind",
)
plt.show()
Violin Plots: quartiles + split
Code
sns.set_style("white")

# Split violins (one half per time of day) with quartile lines drawn inside
sns.violinplot(
    data=birds_filt,
    x="speed",
    y="sky",
    hue="time_of_day",
    palette="colorblind",
    inner="quart",
    split=True,
)
plt.show()
Cleaning up our plots
My minimum expectation:
Code
sns.set_style("white")

# Paired violin plot, kept as an Axes so it can be labelled
g1 = sns.violinplot(
    data=birds_filt,
    x="speed",
    y="sky",
    hue="time_of_day",
    palette="colorblind",
)

# Axis labels and title: readable x label, no y label
g1.set(
    xlabel="Speed (mph)",
    ylabel=None,
    title="Speed of plane collisions with birds",
)

# Human-readable legend title instead of the raw column name
g1.legend(title="Time of day")
plt.show()
# Examine data (info() prints its own report)
diwali.info()

# Data types
# Results below are wrapped in print(): a bare expression in the middle of a
# cell or script is evaluated but never displayed.
print(diwali.dtypes)

# Describe numerical columns
print(diwali.describe())

# Describe categories
print(diwali.describe(exclude=[np.number]))

# Unique levels of every categorical column, keyed by column name
categorical_cols = diwali.select_dtypes(include=['object', 'category']).columns
unique_levels = {col: diwali[col].unique() for col in categorical_cols}
for col, levels in unique_levels.items():
    print(f"{col}: {levels}")

# Outliers
# Make a copy of the diwali data
dataCopy = diwali.copy()

# Select only numerical columns
dataRed = dataCopy.select_dtypes(include=np.number)

# List of numerical columns
dataRedColsList = dataRed.columns

# For all values in the numerical column list from above
for i_col in dataRedColsList:
    # List of the values in i_col
    dataRed_i = dataRed.loc[:, i_col]

    # Define the 25th and 75th percentiles
    q25 = round(dataRed_i.quantile(q=0.25), 3)
    q75 = round(dataRed_i.quantile(q=0.75), 3)

    # Define the interquartile range from the 25th and 75th percentiles
    IQR = round(q75 - q25, 3)

    # Calculate the outlier cutoff
    cut_off = IQR * 1.5

    # Define lower and upper cut-offs
    lower, upper = round(q25 - cut_off, 3), round(q75 + cut_off, 3)

    # Print the values
    print(' ')

    # For each value of i_col, print the 25th and 75th percentiles and IQR
    print(i_col, 'q25 =', q25, 'q75 =', q75, 'IQR =', IQR)

    # Print the lower and upper cut-offs
    print('lower, upper:', lower, upper)

    # Count the values outside the (lower, upper) limits, print that count
    print('Number of Outliers: ', dataRed_i[(dataRed_i < lower) | (dataRed_i > upper)].count())

# Missing values
print(diwali.isnull().sum())

# Normality - qq plot
# Change theme to "white"
sns.set_style("white")

# Make a copy of the data
dataCopy = diwali.copy()

# Remove NAs
# NOTE(review): dropna() removes a row when ANY column is missing, not only
# Amount - confirm that is intended for this data set.
dataCopyFin = dataCopy.dropna()

# Specify desired column
i_col = dataCopyFin.Amount

# Subplots
fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1)

# Density plot
sns.kdeplot(i_col, linewidth=5, ax=ax1)
ax1.set_title('Amount spent (₹)')

# Q-Q plot
sm.qqplot(i_col, line='s', ax=ax2)
ax2.set_title('Amount spent Q-Q plot')

plt.tight_layout()
plt.show()