import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Turn off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None

# Load the county-level dataset straight from the project's GitHub repository.
COUNTY_DATA_URL = "https://raw.githubusercontent.com/Amgg12301/CMSC320-Final/main/county_complete.csv"
county_data = pd.read_csv(COUNTY_DATA_URL)
county_data


# Keep only the identifier columns and the 2017 household/education indicators
# used in the analysis.
selected_columns = [
    "fips",
    "state",
    "name",
    "median_household_income_2017",
    "computer_2017",
    "persons_per_household_2017",
    "hs_grad_2017",
    "bachelors_2017",
]

# Take an explicit copy: df is modified in place further down (imputation, new
# column), so it should be an independent frame rather than a selection of
# county_data.
df = county_data[selected_columns].copy()
df


# Replace missing values in each numeric indicator with that column's mean.
columns_to_impute = [
    'median_household_income_2017',
    'computer_2017',
    'bachelors_2017',
    'hs_grad_2017',
    'persons_per_household_2017',
]

# Series/DataFrame.mean() ignores NaN in both the total and the count, so the
# fill value is the average of the observed values only. (Dividing sum() by
# len() would count the missing rows in the denominator and bias it low.)
# The means are used unrounded: rounding to a whole number would distort
# small-valued columns such as persons_per_household_2017 (values like 2.59).
column_means = df[columns_to_impute].mean()

# fillna() with a Series fills each column with the entry whose label matches
# the column name, replacing the row-by-row loop with one vectorised call.
df[columns_to_impute] = df[columns_to_impute].fillna(column_means)

# Sanity check after imputation: one line is printed per column, and each
# should read False if no missing values remain.
imputed_columns = (
    'median_household_income_2017',
    'computer_2017',
    'bachelors_2017',
    'hs_grad_2017',
    'persons_per_household_2017',
)
for column_name in imputed_columns:
    print(df[column_name].isnull().values.any())

False
False
False
False
False


# Educational achievement score: the midpoint between a county's high school
# graduation rate and its bachelor's degree rate.
graduation_rate_total = df['hs_grad_2017'].add(df['bachelors_2017'])
df['combined_hs_bachelors_2017'] = graduation_rate_total.div(2)
df


# Distinct state names, in order of first appearance in the data.
states = df['state'].unique()
plt.figure(figsize=(15,10))

# Scatter plot with one labelled point per state:
# x = mean of the counties' median household income,
# y = mean of the counties' combined graduation rate.
for state in states:
    state_rows = df[df.state == state]
    county_count = len(state_rows)
    average_income = state_rows['median_household_income_2017'].sum() / county_count
    average_education = state_rows['combined_hs_bachelors_2017'].sum() / county_count

    plt.scatter(average_income, average_education)
    plt.annotate(state, (average_income, average_education))

# Title and axis labels, set on the current axes.
axes = plt.gca()
axes.set_title('Median Household Income vs. Average Percentage of High School & Bachelors Graduates for each U.S. State in 2017')
axes.set_xlabel('Median Household Income')
axes.set_ylabel('Percentage of High School & Bachelors Graduates')
plt.show()


plt.figure(figsize=(15,10))

# Per-state means of computer access and educational achievement, computed in
# a single grouped pass instead of re-filtering the whole frame for every
# state. sort=False keeps the states in order of first appearance.
state_means = df.groupby('state', sort=False)[['computer_2017', 'combined_hs_bachelors_2017']].mean()

# One labelled point per state.
for state, averages in state_means.iterrows():
    average_computer = averages['computer_2017']
    average_education = averages['combined_hs_bachelors_2017']

    plt.scatter(average_computer, average_education)
    plt.annotate(state, (average_computer, average_education))

plt.title('Percentage of Households with Computer vs. Average Percentage of High School & Bachelors Graduates for each U.S. State in 2017')
plt.xlabel('Percentage of Households with Computer')
# The y-axis is the combined high school / bachelor's rate, as in the title.
plt.ylabel('Percentage of High School & Bachelors Graduates')
plt.show()


plt.figure(figsize=(15,10))

# One labelled point per state:
# x = mean persons per household across the state's counties,
# y = mean combined graduation rate across the state's counties.
for state in states:
    state_rows = df[df.state == state]
    county_count = len(state_rows)
    average_persons = state_rows['persons_per_household_2017'].sum() / county_count
    average_education = state_rows['combined_hs_bachelors_2017'].sum() / county_count

    plt.scatter(average_persons, average_education)
    plt.annotate(state, (average_persons, average_education))

# Title and axis labels, set on the current axes.
axes = plt.gca()
axes.set_title('Persons Per Household vs. Average Percentage of High School & Bachelors Graduates for each U.S. State in 2017')
axes.set_xlabel('Persons Per Household')
axes.set_ylabel('Percentage of High School & Bachelors Graduates')
plt.show()


from sklearn.linear_model import LinearRegression

# Restrict the data to Maryland and scatter each county's median household
# income against its combined graduation rate.
plt.figure(figsize=(15,10))
state = df[df.state == 'Maryland']
plt.scatter(state['median_household_income_2017'], state['combined_hs_bachelors_2017'])

# Label every point with its county name. Walking the three columns in
# lockstep gives annotate() plain scalar coordinates; looking each county up
# again by name would hand it one-element Series and re-filter the frame
# twice per county.
for county, x, y in zip(state['name'], state['median_household_income_2017'], state['combined_hs_bachelors_2017']):
    plt.annotate(county.replace('County', ''), (x, y))

# Fit a simple linear regression and overlay the fitted line in red.
# scikit-learn expects a 2-D feature matrix, hence the single-column reshape,
# built once and reused for both fit and predict.
features = state['median_household_income_2017'].to_numpy().reshape(-1, 1)
regr = LinearRegression()
regr.fit(features, state['combined_hs_bachelors_2017'])
predictions = regr.predict(features)
plt.plot(state['median_household_income_2017'], predictions, 'r')

plt.title('Median Household Income vs. Percentage of High School & Bachelors Graduates for Maryland in 2017')
plt.xlabel('Median Household Income')
plt.ylabel('Percentage of High School & Bachelors Graduates')
plt.show()


# Maryland counties: share of households with a computer vs. combined
# graduation rate.
plt.figure(figsize=(15,10))
plt.scatter(state['computer_2017'], state['combined_hs_bachelors_2017'])

# Label every point with its county name. Walking the three columns in
# lockstep gives annotate() plain scalar coordinates; looking each county up
# again by name would hand it one-element Series and re-filter the frame
# twice per county.
for county, x, y in zip(state['name'], state['computer_2017'], state['combined_hs_bachelors_2017']):
    plt.annotate(county.replace('County', ''), (x, y))

# Fit a simple linear regression and overlay the fitted line in red.
# scikit-learn expects a 2-D feature matrix, hence the single-column reshape,
# built once and reused for both fit and predict.
features = state['computer_2017'].to_numpy().reshape(-1, 1)
regr = LinearRegression()
regr.fit(features, state['combined_hs_bachelors_2017'])
predictions = regr.predict(features)
plt.plot(state['computer_2017'], predictions, 'r')

plt.title('Percentage of Households with Computer vs. Percentage of High School & Bachelors Graduates for Maryland in 2017')
plt.xlabel('Percentage of Households with Computer')
plt.ylabel('Percentage of High School & Bachelors Graduates')
plt.show()


# Maryland counties: persons per household vs. combined graduation rate.
plt.figure(figsize=(15,10))
plt.scatter(state['persons_per_household_2017'], state['combined_hs_bachelors_2017'])

# Label every point with its county name. Walking the three columns in
# lockstep gives annotate() plain scalar coordinates; looking each county up
# again by name would hand it one-element Series and re-filter the frame
# twice per county.
for county, x, y in zip(state['name'], state['persons_per_household_2017'], state['combined_hs_bachelors_2017']):
    plt.annotate(county.replace('County', ''), (x, y))

# Fit a simple linear regression and overlay the fitted line in red.
# scikit-learn expects a 2-D feature matrix, hence the single-column reshape,
# built once and reused for both fit and predict.
features = state['persons_per_household_2017'].to_numpy().reshape(-1, 1)
regr = LinearRegression()
regr.fit(features, state['combined_hs_bachelors_2017'])
predictions = regr.predict(features)
plt.plot(state['persons_per_household_2017'], predictions, 'r')

plt.title('Persons Per Household vs. Percentage of High School & Bachelors Graduates for Maryland in 2017')
plt.xlabel('Persons Per Household')
plt.ylabel('Percentage of High School & Bachelors Graduates')
plt.show()


import statsmodels.api as sm

# OLS regression of the combined graduation rate on median household income,
# used as a hypothesis test of how strong the linear relationship is.
# add_constant() supplies the intercept ("const") term of the model.
y = state['combined_hs_bachelors_2017']
x = sm.add_constant(state['median_household_income_2017'])

# Build the model, fit it, and display the summary table.
model = sm.OLS(endog=y, exog=x)
results = model.fit()
results.summary()


# OLS regression of the combined graduation rate on the share of households
# with a computer; add_constant() supplies the intercept ("const") term.
y = state['combined_hs_bachelors_2017']
x = sm.add_constant(state['computer_2017'])

# Build the model, fit it, and display the summary table.
model = sm.OLS(endog=y, exog=x)
results = model.fit()
results.summary()


# OLS regression of the combined graduation rate on persons per household;
# add_constant() supplies the intercept ("const") term.
y = state['combined_hs_bachelors_2017']
x = sm.add_constant(state['persons_per_household_2017'])

# Build the model, fit it, and display the summary table.
model = sm.OLS(endog=y, exog=x)
results = model.fit()
results.summary()


from sklearn import tree
from sklearn.model_selection import train_test_split

# Hold-out evaluation of a linear regression of the combined graduation rate
# on median household income, for comparison with the OLS summary.
lr = LinearRegression()

# Hold out 25% of the Maryland counties for testing. The feature is reshaped
# to the single-column 2-D form scikit-learn expects once, up front. A fixed
# random_state makes the split, and therefore the reported scores, repeatable;
# without it the result changes on every run.
X_train, X_test, y_train, y_test = train_test_split(
    state['median_household_income_2017'].to_numpy().reshape(-1, 1),
    state['combined_hs_bachelors_2017'].to_numpy(),
    test_size=0.25,
    random_state=42,
)

# fit the model on the training split only
lr.fit(X_train, y_train)

# predictions for both splits
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# R-squared on each split. The *_tree names are historical: the model is a
# linear regression, not a decision tree.
train_tree = lr.score(X_train, y_train)
test_tree = lr.score(X_test, y_test)
train_tree, test_tree

(0.7089435214057382, 0.6553940941546065)


# Hold-out evaluation of a linear regression of the combined graduation rate
# on the share of households with a computer.
lr = LinearRegression()

# Hold out 25% of the Maryland counties for testing. The feature is reshaped
# to the single-column 2-D form scikit-learn expects once, up front. A fixed
# random_state makes the split, and therefore the reported scores, repeatable;
# without it the result changes on every run.
X_train, X_test, y_train, y_test = train_test_split(
    state['computer_2017'].to_numpy().reshape(-1, 1),
    state['combined_hs_bachelors_2017'].to_numpy(),
    test_size=0.25,
    random_state=42,
)
lr.fit(X_train, y_train)

# predictions for both splits
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# R-squared on each split. The *_tree names are historical: the model is a
# linear regression, not a decision tree.
train_tree = lr.score(X_train, y_train)
test_tree = lr.score(X_test, y_test)
train_tree, test_tree

(0.7029996080872234, 0.6702080625700917)


# Hold-out evaluation of a linear regression of the combined graduation rate
# on persons per household.
lr = LinearRegression()

# Hold out 25% of the Maryland counties for testing. The feature is reshaped
# to the single-column 2-D form scikit-learn expects once, up front. A fixed
# random_state makes the split, and therefore the reported scores, repeatable;
# without it the result changes on every run.
X_train, X_test, y_train, y_test = train_test_split(
    state['persons_per_household_2017'].to_numpy().reshape(-1, 1),
    state['combined_hs_bachelors_2017'].to_numpy(),
    test_size=0.25,
    random_state=42,
)
lr.fit(X_train, y_train)

# predictions for both splits
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# R-squared on each split. The *_tree names are historical: the model is a
# linear regression, not a decision tree.
train_tree = lr.score(X_train, y_train)
test_tree = lr.score(X_test, y_test)
train_tree, test_tree

(0.1959352455235044, 0.155132068546192)

	fips	state	name	pop2000	pop2010	pop2011	pop2012	pop2013	pop2014	pop2015	...	poverty_under_18_2019	two_plus_races_2019	unemployment_rate_2019	uninsured_2019	uninsured_65_and_older_2019	uninsured_under_19_2019	uninsured_under_6_2019	veterans_2019	white_2019	white_not_hispanic_2019
0	1001	Alabama	Autauga County	43671.0	54571	55199.0	54927.0	54695.0	54864.0	54838.0	...	23.2	2.2	3.5	7.1	0.0	1.7	1.7	12.6	76.8	74.6
1	1003	Alabama	Baldwin County	140415.0	182265	186534.0	190048.0	194736.0	199064.0	202863.0	...	13.4	1.7	4.0	8.9	0.3	3.8	2.2	11.8	86.2	83.1
2	1005	Alabama	Barbour County	29038.0	27457	27351.0	27175.0	26947.0	26749.0	26264.0	...	50.1	1.2	9.4	11.3	0.3	3.3	3.4	6.6	46.8	45.8
3	1007	Alabama	Bibb County	20826.0	22915	22745.0	22658.0	22503.0	22533.0	22561.0	...	NaN	0.6	7.0	10.7	0.0	2.0	4.5	8.0	76.8	74.5
4	1009	Alabama	Blount County	51024.0	57322	57562.0	57595.0	57623.0	57546.0	57590.0	...	18.4	1.6	3.1	10.8	0.2	5.9	6.1	7.7	95.5	86.9
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3137	56037	Wyoming	Sweetwater County	37613.0	43806	44013.0	45042.0	45145.0	44981.0	44732.0	...	NaN	2.3	5.7	11.3	0.5	9.2	13.8	8.6	93.4	79.6
3138	56039	Wyoming	Teton County	18251.0	21294	21476.0	21709.0	22326.0	22817.0	23029.0	...	NaN	0.7	0.7	12.7	0.0	10.1	5.9	5.3	89.3	81.3
3139	56041	Wyoming	Uinta County	19742.0	21118	20899.0	20999.0	20960.0	20845.0	20780.0	...	NaN	3.5	5.5	11.2	0.6	6.8	1.8	7.4	93.4	87.5
3140	56043	Wyoming	Washakie County	8289.0	8533	8460.0	8421.0	8427.0	8288.0	8296.0	...	NaN	3.8	4.1	15.0	1.5	7.0	7.8	11.9	89.7	81.9
3141	56045	Wyoming	Weston County	6644.0	7208	7141.0	7074.0	7136.0	7142.0	7181.0	...	NaN	1.3	4.0	11.8	0.0	8.6	7.1	10.3	97.4	96.4

	fips	state	name	median_household_income_2017	computer_2017	persons_per_household_2017	hs_grad_2017	bachelors_2017
0	1001	Alabama	Autauga County	55317.0	86.2	2.59	87.7	25.0
1	1003	Alabama	Baldwin County	52562.0	86.9	2.63	90.2	30.7
2	1005	Alabama	Barbour County	33368.0	73.4	2.54	73.1	12.0
3	1007	Alabama	Bibb County	43404.0	74.8	2.97	82.1	13.2
4	1009	Alabama	Blount County	47412.0	78.2	2.76	79.8	13.1
...	...	...	...	...	...	...	...	...
3137	56037	Wyoming	Sweetwater County	71083.0	92.1	2.70	91.3	22.2
3138	56039	Wyoming	Teton County	80049.0	95.4	2.51	95.1	54.1
3139	56041	Wyoming	Uinta County	54672.0	91.2	2.66	91.8	17.4
3140	56043	Wyoming	Washakie County	51362.0	87.7	2.32	88.5	21.0
3141	56045	Wyoming	Weston County	59605.0	82.9	2.12	91.9	19.8

	fips	state	name	median_household_income_2017	computer_2017	persons_per_household_2017	hs_grad_2017	bachelors_2017	combined_hs_bachelors_2017
0	1001	Alabama	Autauga County	55317.0	86.2	2.59	87.7	25.0	56.35
1	1003	Alabama	Baldwin County	52562.0	86.9	2.63	90.2	30.7	60.45
2	1005	Alabama	Barbour County	33368.0	73.4	2.54	73.1	12.0	42.55
3	1007	Alabama	Bibb County	43404.0	74.8	2.97	82.1	13.2	47.65
4	1009	Alabama	Blount County	47412.0	78.2	2.76	79.8	13.1	46.45
...	...	...	...	...	...	...	...	...	...
3137	56037	Wyoming	Sweetwater County	71083.0	92.1	2.70	91.3	22.2	56.75
3138	56039	Wyoming	Teton County	80049.0	95.4	2.51	95.1	54.1	74.60
3139	56041	Wyoming	Uinta County	54672.0	91.2	2.66	91.8	17.4	54.60
3140	56043	Wyoming	Washakie County	51362.0	87.7	2.32	88.5	21.0	54.75
3141	56045	Wyoming	Weston County	59605.0	82.9	2.12	91.9	19.8	55.85

Dep. Variable:	combined_hs_bachelors_2017	R-squared:	0.706
Model:	OLS	Adj. R-squared:	0.692
Method:	Least Squares	F-statistic:	52.70
Date:	Sun, 16 May 2021	Prob (F-statistic):	2.85e-07
Time:	20:05:47	Log-Likelihood:	-65.581
No. Observations:	24	AIC:	135.2
Df Residuals:	22	BIC:	137.5
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[0.025	0.975]
const	40.8340	2.817	14.494	0.000	34.991	46.677
median_household_income_2017	0.0003	3.73e-05	7.260	0.000	0.000	0.000

Do household factors significantly impact educational achievement?

Amogh Giri, Ian Yeh, Rivke Weingarten

Introduction

Data Source

Libraries Used

Data Collection

Data Analysis & Visualization

Hypothesis Testing & Machine Learning

Conclusion

Omnibus:	4.438	Durbin-Watson:	1.362
Prob(Omnibus):	0.109	Jarque-Bera (JB):	1.550
Skew:	0.041	Prob(JB):	0.461
Kurtosis:	1.758	Cond. No.	2.69e+05

Omnibus:	0.339	Durbin-Watson:	1.529
Prob(Omnibus):	0.844	Jarque-Bera (JB):	0.503
Skew:	0.161	Prob(JB):	0.778
Kurtosis:	2.369	Cond. No.	1.55e+03

Omnibus:	1.273	Durbin-Watson:	2.086
Prob(Omnibus):	0.529	Jarque-Bera (JB):	0.789
Skew:	0.442	Prob(JB):	0.674
Kurtosis:	2.906	Cond. No.	43.9

	coef	std err	t	P>|t|	[0.025	0.975]
const	-41.4921	13.894	-2.986	0.007	-70.307	-12.677
computer_2017	1.1665	0.159	7.350	0.000	0.837	1.496

	coef	std err	t	P>|t|	[0.025	0.975]
const	15.9221	19.207	0.829	0.416	-23.911	55.755
persons_per_household_2017	17.0973	7.356	2.324	0.030	1.842	32.353