In this lesson we will cover the following:
We will use the following data sets for the US Atlantic region (except Florida), which are available for download here.
Age-adjusted lung and bronchus cancer (LBC) mortality rates from 1980 to 2014 by county were obtained from the Institute for Health Metrics and Evaluation (IHME). The rates were estimated using spatially explicit Bayesian mixed effects regression models for both males and females (Mokdad et al., 2016). The models incorporated seven covariates: the proportion of the adult population that graduated high school, the proportion of the population that is Hispanic, the proportion of the population that is black, the proportion of the population that is a race other than black or white, the proportion of the county contained within a state or federal Native American reservation, the median household income, and the population density. To reduce the noise caused by unstable rates in counties with small populations, we smoothed the data with an Empirical Bayesian smoother, a tool for dealing with rate instability associated with small sample sizes. It takes the population of the region as an indicator of confidence in the data, with a larger population providing higher confidence in the value at that location. We used the SpaceStat (evaluation version) software for smoothing the data.
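The actual smoothing was done in SpaceStat; purely as an illustration of the empirical Bayes idea, here is a minimal sketch of Marshall's global smoother applied to hypothetical deaths and population arrays (not the IHME data):
import numpy as np

# Hypothetical county-level inputs for illustration only (not the IHME data)
deaths = np.array([12, 3, 150, 7, 60], dtype=float)
population = np.array([20000, 4000, 250000, 9000, 90000], dtype=float)

raw = deaths / population                        # crude county rate (proportion)
theta = deaths.sum() / population.sum()          # overall (prior) mean rate
s2 = np.average((raw - theta) ** 2, weights=population)
phi = max(s2 - theta / population.mean(), 0.0)   # estimated between-county variance

# Shrinkage weight: populous counties keep their raw rate,
# small counties are pulled toward the overall mean
w = phi / (phi + theta / population)
eb_rate = theta + w * (raw - theta)
print(np.round(eb_rate * 100000, 1))             # smoothed rate per 100,000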
Age-standardized cigarette smoking prevalence (%) by county was estimated from Behavioral Risk Factor Surveillance System (BRFSS) survey data (Dwyer-Lindgren et al., 2014). Prevalence was estimated with logistic hierarchical mixed effects regression models, stratified by sex. All estimates were age-standardized to the age structure of the 2000 census, and the uncertainty of the prevalence estimates was assessed using simulation methods.
County-level poverty data (% of the population below the poverty level) are from the US Census Small Area Income and Poverty Estimates (SAIPE) Program. Income and poverty rates were estimated by combining survey data with population estimates and administrative records.
Annual mean PM2.5 from 1998 to 2012 was modeled from aerosol optical depth from multiple satellite products (Multi-angle Imaging SpectroRadiometer (MISR), MODIS Dark Target, MODIS and SeaWiFS Deep Blue, and MODIS MAIAC) and validated against ground-based PM2.5 monitoring sites (van Donkelaar et al., 2016). All PM2.5 raster grids were resampled to a 2.5 km x 2.5 km grid using Empirical Bayesian Kriging in the ArcPython geostatistical tools (ESRI, 2017). County population-weighted means for each year were calculated from the population data.
Annual mean ambient NO2 concentrations were estimated from three satellite instruments, the Global Ozone Monitoring Experiment (GOME), the Scanning Imaging Absorption Spectrometer for Atmospheric Chartography (SCIAMACHY), and GOME-2, in combination with a chemical transport model (Geddes et al., 2016). All NO2 raster grids were resampled to the same 2.5 km x 2.5 km grid using Empirical Bayesian Kriging (ESRI, 2017), and county population-weighted means for each year were calculated from the population data.
Annual mean ambient SO2 concentrations were obtained from time series of multi-source SO2 emission retrievals combining satellite and surface measurements (Fioletov et al., 2017). All SO2 raster grids were resampled to the same 2.5 km x 2.5 km grid using Empirical Bayesian Kriging (ESRI, 2017), and county population-weighted means for each year were calculated from the population data.
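For all three pollutants, the county population-weighted mean amounts to sum(value x population) / sum(population) over the grid cells falling in each county. Below is a minimal pandas sketch with hypothetical column names; the lesson's actual workflow used the ArcGIS geostatistical tools:
import pandas as pd

# Hypothetical grid-cell table: each row is one resampled cell with its county FIPS,
# pollutant value, and population (illustration only)
cells = pd.DataFrame({
    'FIPS': ['36001', '36001', '36003', '36003', '36003'],
    'PM25': [8.2, 7.9, 6.1, 6.4, 5.8],
    'pop':  [1200, 800, 300, 500, 200],
})

# County population-weighted mean: sum(value * pop) / sum(pop)
popwt = (cells.groupby('FIPS')
              .apply(lambda g: (g['PM25'] * g['pop']).sum() / g['pop'].sum())
              .rename('PM25_popwt')
              .reset_index())
print(popwt)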
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.colors import ListedColormap, BoundaryNorm
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from mpl_toolkits.mplot3d import Axes3D # noqa: F401 unused import
from mpl_toolkits.axes_grid1 import make_axes_locatable
import mapclassify
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns
from glob import glob
from functools import reduce
from osgeo import ogr, gdal
from osgeo import gdalconst
import geopandas as gpd
import pycrs
import fiona
from fiona.crs import from_epsg
from shapely.geometry import box
from shapely.geometry import Point
import shapely.geometry as geom
import json
from scipy.stats import sem
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.api import abline_plot
from statsmodels import graphics
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
%matplotlib inline
# What version of Python do you have?
print(f"Python {sys.version}")
print(f"geopandas {gpd.__version__}")
print(f"osgeo {gdal.__version__}")
import tensorflow.keras
print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tensorflow.keras.__version__}")
print()
print(f"Python {sys.version}")
print("GPU is", "available" if tf.test.is_gpu_available() else "NOT AVAILABLE")
sns.set_style("white")
sns.set(font_scale=1.5)
path= "E:/GitHub/geospatial-python-github.io/Lesson_05_working_with_polygon_data"
os.chdir(path)
print("Current Working Directory " , os.getcwd())
cancer=pd.read_csv("Data_05/Lung_cancer_1998_2012.csv")
smoking=pd.read_csv("Data_05/SMOKING_1998_2012.csv")
poverty=pd.read_csv("Data_05/POVERTY_1998_2012.csv")
PM25=pd.read_csv("Data_05/PM25_1998_2012.csv")
NO2=pd.read_csv("Data_05/NO2_1998_2012.csv")
SO2=pd.read_csv("Data_05/SO2_1998_2012.csv")
Load the county shapefile and extract centroids
county_gpd=gpd.read_file('Data_05/county_map.shp')
county_gpd.plot()
county_gpd.head()
Now we will get the centroid of each county.
# copy poly to new GeoDataFrame
cen_points = county_gpd.copy()
# change the geometry
cen_points.geometry = cen_points['geometry'].centroid
# same crs
cen_points.crs =county_gpd.crs
cen_points.head()
fig, ay = plt.subplots()
county_gpd.plot(ax=ay, color="white", edgecolor="gray")
cen_points.plot(ax=ay, color='red', markersize=1)
ay.set_title('County Centroids')
# Get X (m) and Y (m) Values from geometry column
cen_points['X'] = cen_points.geometry.x
cen_points['Y'] = cen_points.geometry.y
cen_points.head()
### Convert the GeoPandas data frame to a pandas data frame without the geometry column
columns =['STATE','COUNTY','REGION','DIVISION','geometry' ]
cen_df = pd.DataFrame(cen_points.drop(columns=columns))
cen_df.head()
We will create a data frame of the 15-year (1998-2012) means of all variables and use these data for some statistical analysis.
# copy cen_df as "avg" pd-dataframe
avg_df = cen_df.copy()
avg_df["cancer"]= cancer.iloc[:,2:16].mean(axis=1)
avg_df["poverty"]= poverty.iloc[:,2:16].mean(axis=1)
avg_df["smoking"]= smoking.iloc[:,2:16].mean(axis=1)
avg_df["PM25"]= PM25.iloc[:,2:16].mean(axis=1)
avg_df["NO2"]= NO2.iloc[:,2:16].mean(axis=1)
avg_df["SO2"]= SO2.iloc[:,2:16].mean(axis=1)
avg_df.head()
# Write as CSV file
avg_df.to_csv('Data_05/county_mean.csv',index=False)
# Drop the 'code' column from the pandas data frame
avg_mf=avg_df.drop("code", axis=1)
county_gpd_data = county_gpd.merge(avg_mf, on='FIPS')
county_gpd_data.head()
county_gpd_data.to_file("Data_05/county_mean_data.shp")
# Make subplots that are next to each other
fig, (bx1, bx2, bx3) = plt.subplots(nrows=1, ncols=3, figsize=(12, 5))
# Plot Cancer
county_gpd_data.plot(column='cancer',
edgecolor="gray",
ax=bx1,
cmap='Blues',
linewidth=0.5
)
bx1.set_title("LBC Mortality Rate");
bx1.axis('off');
fig.colorbar(bx1.collections[0], ax=bx1)
# Plot Poverty
county_gpd_data.plot(column='poverty',
edgecolor="gray",
ax=bx2,
cmap= 'Blues',
linewidth=0.5
);
bx2.set_title("Povert (%)");
bx2.axis('off');
fig.colorbar(bx2.collections[0], ax=bx2)
# Plot Smoking
county_gpd_data.plot(column='smoking',
edgecolor="gray",
ax=bx3,
cmap='Blues',
linewidth=0.5
);
bx3.set_title("Smoking (%)");
bx3.axis('off');
fig.colorbar(bx3.collections[0], ax=bx3)
# Make subplots that are next to each other
fig, (bx1, bx2, bx3) = plt.subplots(nrows=1, ncols=3, figsize=(12, 5))
# Plot PM25
county_gpd_data.plot(column='PM25',
edgecolor="gray",
ax=bx1,
cmap='Blues',
linewidth=0.5,
)
bx1.set_title("PM25");
bx1.axis('off');
fig.colorbar(bx1.collections[0], ax=bx1)
# Plot NO2
county_gpd_data.plot(column='NO2',
edgecolor="gray",
ax=bx2,
cmap= 'Blues',
linewidth=0.5,
);
bx2.set_title("NO2");
bx2.axis('off');
fig.colorbar(bx2.collections[0], ax=bx2)
# Plot SO2
county_gpd_data.plot(column='SO2',
edgecolor="gray",
ax=bx3,
cmap='Blues',
linewidth=0.5,
);
bx3.set_title("SO2");
bx3.axis('off');
fig.colorbar(bx3.collections[0], ax=bx3)
Once you start using them, for loops are fairly simple. The syntax of our for loop says: for each year in the list_of_years list, run the following code; when all years in the list have gone through the code, the loop stops.
county_gpd=gpd.read_file('Data_05/county_map.shp')
cancer_df=pd.read_csv("Data_05/Lung_cancer_1998_2012.csv")
cancer_df.describe()
cancer_gpd = county_gpd.merge(cancer_df, on='FIPS')
# save all the maps in the charts folder
output_path = 'Data_05/Maps'
# counter for the for loop
i = 0
# list of years (which are the column names at the moment)
list_of_years = ['1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
'2006', '2007', '2008','2010','2011', '2012']
# set the min and max range for the choropleth map
vmin, vmax = 30, 130
# start the for loop to create one map per year
for year in list_of_years:
    # create map; UPDATE: added plt.Normalize to keep the legend range the same for all maps
    ax = cancer_gpd.plot(column=year,
                         cmap='Blues',
                         figsize=(6, 5),
                         linewidth=0.5,
                         edgecolor='0.8',
                         vmin=vmin,
                         vmax=vmax,
                         legend=True, norm=plt.Normalize(vmin=vmin, vmax=vmax))
    # remove axis of chart
    ax.axis('off')
    # add a title
    ax.set_title('LBC Mortality Rate',
                 fontdict={'fontsize': '20',
                           'fontweight': '3'})
    # position the year annotation at the bottom left (added before saving so it appears in the file)
    ax.annotate(year,
                xy=(0.1, .225), xycoords='figure fraction',
                horizontalalignment='left', verticalalignment='top',
                fontsize=15)
    # save the figure as a high-res png in the output path (you can also save as svg if you prefer)
    filepath = os.path.join(output_path, year + '_LBC.png')
    chart = ax.get_figure()
    chart.savefig(filepath, dpi=300)
We will use the county mean data (avg_df) to calculate the state mean and standard deviation of LBC mortality rates and all other covariates.
# Mean
state_mean=avg_df.groupby(['code'])[['cancer','smoking','poverty','PM25']].mean()
state_mean.head()
# Standard deviation
state_std=avg_df.groupby(['code'])[['cancer','smoking','poverty','PM25']].std()
state_std.head()
Load the state shapefile and join the mean and standard deviation.
state_gpd= gpd.read_file('Data_05/state_map.shp')
state_gpd_mean = state_gpd.merge(state_mean, on='code')
state_gpd_mean.head()
state_gpd=gpd.read_file('Data_05/state_map.shp')
state_gpd_std = state_gpd.merge(state_std, on='code')
state_gpd_std.head()
# Make subplots that are next to each other
fig, (bx1, bx2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
# Plot mean data
state_gpd_mean.plot(column='cancer',
edgecolor="gray",
ax=bx1,
cmap= 'Blues',
linewidth=0.5
);
bx1.set_title("Mean LBC Rate");
bx1.axis('off');
fig.colorbar(bx1.collections[0], ax=bx1)
# Plot Standard deviation
state_gpd_std.plot(column='cancer',
edgecolor="gray",
ax=bx2,
cmap= 'Blues',
linewidth=0.5
);
bx2.set_title("STDEV LBC Rate");
bx2.axis('off');
fig.colorbar(bx2.collections[0], ax=bx2)
Plotly's Python graphing library makes interactive, publication-quality graphs. It provides examples of how to make line plots, scatter plots, area charts, bar charts, error bars, box plots, histograms, heatmaps, subplots, multiple axes, polar charts, and bubble charts.
fig = go.Figure(data=go.Choropleth(
locations=state_gpd_mean['code'], # Spatial coordinates
z = state_gpd_mean['cancer'].astype(float), # Data to be color-coded
locationmode = 'USA-states', # set of locations match entries in `locations`
colorscale = 'Reds',
colorbar_title = "Death (per 100K)",
))
fig.update_layout(
title_text = 'Mean LBC Mortality Rate by Selected State',
geo_scope='usa', # limit map scope to USA
)
fig.show()
Making choropleth maps from your geospatial data requires two main types of input: GeoJSON-formatted geometry information where each feature has an id, and a list of values indexed by feature id. The GeoJSON data is passed to the geojson attribute, and the data is passed into the z attribute (color for px.choropleth_mapbox), in the same order as the IDs are passed into the locations attribute.
mf = pd.read_csv("Data_05/county_mean.csv", dtype={"FIPS": str})
mf.head()
with open('Data_05/county_gcs.json') as read_file:
counties = json.load(read_file)
counties["features"][0]
fig = px.choropleth_mapbox(mf, geojson=counties, locations='FIPS', color='cancer',
color_continuous_scale="Viridis",
range_color=(0, 120),
mapbox_style="carto-positron",
zoom=3, center = {"lat": 45.0902, "lon": -75.7129},
opacity=0.4,
labels={'cancer':'LBC mortality rate'}
)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
In statistics, exploratory data analysis (EDA) is an approach to analyzing data sets to summarize their main characteristics, often with visual methods. A statistical model can be used or not, but primarily EDA is for seeing what the data can tell us beyond the formal modeling or hypothesis testing task (Wikipedia, 2020).
# convert to a pandas data frame
df=pd.DataFrame(county_gpd_data)
df.head()
mf=df[['cancer', 'poverty','smoking', 'PM25', 'NO2', 'SO2']]
mf.describe()
ax = sns.boxplot(x="code", y="cancer", data=df)
# Usual boxplot
ax = sns.boxplot(x='REGION', y='cancer', data=df)
# Add jitter with the swarmplot function.
ax = sns.swarmplot(x='REGION', y='cancer', data=df, color="grey")
Plotly Express is the easy-to-use, high-level interface to Plotly, which operates on "tidy" data. In a box plot created by px.box, the distribution of the column given as the y argument is represented.
fig = px.box(df, x="code", y="cancer")
fig.show()
mf.corr()
corrmat = mf.corr()
f, ax = plt.subplots(figsize =(9, 8))
sns.heatmap(corrmat, ax = ax, cmap ="YlGnBu", linewidths = 0.1)
cg = sns.clustermap(corrmat, cmap ="YlGnBu", linewidths = 0.1);
plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation = 0)
cg
pd.plotting.scatter_matrix(df[['cancer','poverty','smoking','PM25']], alpha=0.2, figsize=(10, 10))
plt.show()
Seaborn makes it easy to build a correlogram (correlation matrix plot). Correlograms are great for exploratory analysis: they let you quickly observe the relationship between every pair of variables in your matrix. Just call the pairplot function:
sns.pairplot(df[['cancer','poverty','smoking','PM25']], diag_kind="kde")
df.plot(x='smoking', y='cancer', style ='o')
px.scatter_3d(df, x='smoking', y='poverty', z='cancer',
color='cancer',
size='cancer', size_max=18,
opacity=0.7)
A marginal plot allows you to study the relationship between two numeric variables. The central chart displays their correlation; it is usually a scatterplot, a hexbin plot, a 2D histogram, or a 2D density plot. The marginal charts, usually at the top and at the right, show the distribution of the two variables using a histogram or density plot.
sns.jointplot(x=df["smoking"], y=df["cancer"], kind='scatter')
sns.jointplot(x=df["smoking"], y=df["cancer"], kind='hex')
sns.jointplot(x=df["smoking"], y=df["cancer"], kind='kde')
First we will use "scikit-learn", a machine learning library for the Python programming language.It features various classification, regression and clustering algorithms including support vector machines, random forests, gradient boosting, k-means and DBSCAN, and is designed to interoperate with the Python numerical and scientific libraries NumPy and SciPy (Wikipedia).
X = df['smoking'].values.reshape(-1,1)
y = df['cancer'].values.reshape(-1,1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
ols = LinearRegression()
ols=ols.fit(X_train, y_train) #training the algorithm
print(ols.intercept_)
print(ols.coef_)
y_train_pred=ols.predict(X_train)
train_pred= pd.DataFrame({'Actual': y_train.flatten(), 'Predicted': y_train_pred.flatten()})
train_pred['residuals']=train_pred['Actual'] - train_pred['Predicted']
train_pred
Now that we have trained our algorithm, it's time to make some predictions. To do so, we will use our test data and see how accurately the algorithm predicts the LBC mortality rate.
y_test_pred = ols.predict(X_test)
To compare the actual output values for X_test with the predicted values, execute the following script:
test_pred = pd.DataFrame({'Observed': y_test.flatten(), 'Predicted': y_test_pred.flatten()})
test_pred.head()
plt.figure(figsize=(6, 6))
ax = sns.regplot(x="Observed", y="Predicted", data=test_pred )
ax.set_ylim(30,120)
ax.set_xlim(30,120)
ax.plot([30, 120], [30, 120], 'k--', lw=2)
plt.title('OLS Predicted LBC Mortality Rate',fontsize=18)
Now we evaluate the performance of the algorithm. For regression algorithms, three evaluation metrics are commonly used: mean absolute error (MAE), mean squared error (MSE), and root mean squared error (RMSE).
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_test_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_test_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))
print('R2:', metrics.r2_score(y_test, y_test_pred))
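The same numbers can be checked by hand with NumPy (a quick sketch using the y_test and y_test_pred arrays defined above):
# Manual check of the metrics (should match the scikit-learn values above)
errors = y_test - y_test_pred
print('MAE :', np.mean(np.abs(errors)))
print('MSE :', np.mean(errors ** 2))
print('RMSE:', np.sqrt(np.mean(errors ** 2)))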
Xm=df[['poverty','smoking','PM25','NO2','SO2']].values
y = df[['cancer']].values
Xm_train, Xm_test, y_train, y_test = train_test_split(Xm, y, test_size=0.2, random_state=0)
mlr = LinearRegression()
mlr.fit(Xm_train, y_train)
pred_mlr = pd.DataFrame(['poverty','smoking','PM25','NO2','SO2'])
coeff = pd.DataFrame(mlr.coef_, index=['Co-efficient']).transpose()
pd.concat([pred_mlr,coeff], axis=1, join='inner')
Now let's make predictions on the test data.
y_mlr_pred = mlr.predict(Xm_test)
mlr_test_pred = pd.DataFrame({'Observed': y_test.flatten(), 'Predicted': y_mlr_pred.flatten()})
mlr_test_pred.head()
plt.figure(figsize=(6, 6))
ax = sns.regplot(x="Observed", y="Predicted", data=mlr_test_pred)
ax.set_ylim(30,120)
ax.set_xlim(30,120)
ax.plot([30, 120], [30, 120], 'k--', lw=2)
plt.title('MLR Predicted LBC Mortality Rate',fontsize=18)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_mlr_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_mlr_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_mlr_pred)))
In K-fold cross-validation the data is split into k different subsets (folds). We use k-1 subsets to train the model and hold out the remaining subset (fold) as test data; each fold serves as the test set once, and we average the model's error across the folds. We will use the entire data set (n = 666) in this tutorial.
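To make the fold mechanics concrete, here is a tiny sketch, on a toy array of ten samples rather than our data, of how KFold partitions the indices into train and test sets:
# Toy illustration of how KFold splits indices into train/test folds
toy = np.arange(10)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for fold, (train_idx, test_idx) in enumerate(kf.split(toy), start=1):
    # each fold trains on k-1 subsets and tests on the remaining one
    print('Fold', fold, 'train:', train_idx, 'test:', test_idx)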
cv_Kfold = KFold(n_splits=10, random_state=1, shuffle=True)
mlr = LinearRegression()
mlr.fit(Xm, y)
# negative MSE for each of the 10 folds
cv_scores_Kfold = cross_val_score(mlr, Xm, y, scoring="neg_mean_squared_error", cv=cv_Kfold, n_jobs=1)
all_scores = pd.DataFrame(cross_validate(mlr, Xm, y, scoring=['r2', 'neg_mean_absolute_error'], n_jobs=-1, verbose=0))
print(all_scores)
# mean and standard deviation of the fold MSEs
print("Folds: " + str(len(cv_scores_Kfold)) + ", MSE: " + str(np.mean(np.abs(cv_scores_Kfold))) + ", STD: " + str(np.std(cv_scores_Kfold)))
y_pred_KFold = cross_val_predict(mlr, Xm, y, cv=10)
Kfold_pred = pd.DataFrame({'Observed': y.flatten(), 'Predicted': y_pred_KFold.flatten()})
Kfold_pred.head()
plt.figure(figsize=(6, 6))
ax = sns.regplot(x="Observed", y="Predicted", data=Kfold_pred)
ax.set_ylim(30,130)
ax.set_xlim(30,130)
ax.plot([30, 130], [30, 130], 'k--', lw=2)
plt.title('K-Fold Cross-Validation',fontsize=18)
print('Mean Absolute Error:', metrics.mean_absolute_error(y, y_pred_KFold))
print('Mean Squared Error:', metrics.mean_squared_error(y, y_pred_KFold))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y, y_pred_KFold)))
print('R2:', metrics.r2_score(y, y_pred_KFold))
In leave-one-out cross-validation (LOOCV), the number of folds (subsets) equals the number of observations.
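Note that scikit-learn also provides a dedicated LeaveOneOut splitter, equivalent to KFold with n_splits equal to the number of observations; a short sketch of that alternative (fitting the model once per observation, so it is slow):
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()                              # one fold per observation
print('LOOCV folds:', loo.get_n_splits(Xm))
# It can be passed anywhere a cv object is accepted, e.g.:
# cross_val_score(mlr, Xm, y, scoring="neg_mean_squared_error", cv=loo, n_jobs=-1)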
cv_LOV = KFold(n_splits=666, random_state=None, shuffle=False)
cv_scores_LOV = cross_val_score(mlr, Xm, y, scoring="neg_mean_squared_error", cv=cv_LOV, n_jobs=1)
print("Folds: " + str(len(cv_scores_LOV)) + ", MSE: " + str(np.mean(np.abs(cv_scores_LOV))) + ", STD: " + str(np.std(cv_scores_LOV)))
y_pred_LOV = cross_val_predict(mlr, Xm, y, cv=cv_LOV)
LOV_pred = pd.DataFrame({'Observed': y.flatten(), 'Predicted': y_pred_LOV.flatten()})
LOV_pred.head()
plt.figure(figsize=(6, 6))
ax = sns.regplot(x="Observed", y="Predicted", data=LOV_pred)
ax.set_ylim(30,130)
ax.set_xlim(30,130)
ax.plot([30, 130], [30, 130], 'k--', lw=2)
plt.title('LOV Cross-Validation',fontsize=18)
print('Mean Absolute Error:', metrics.mean_absolute_error(y, y_pred_LOV))
print('Mean Squared Error:', metrics.mean_squared_error(y, y_pred_LOV))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y, y_pred_LOV)))
print('R2:', metrics.r2_score(y, y_pred_LOV))
You can use R-style formulas together with pandas data frames to fit your models. Here is a simple example using ordinary least squares:
ols_results = smf.ols('cancer ~ smoking', data=df).fit()
print(ols_results.summary())
mlr_results = smf.ols('cancer ~ smoking + poverty + PM25 + NO2 + SO2', data=df).fit()
print(mlr_results.summary())
In statistics, the generalized linear model (GLM) is a flexible generalization of ordinary linear regression that allows for response variables that have error distribution models other than a normal distribution. The GLM generalizes linear regression by allowing the linear model to be related to the response variable via a link function and by allowing the magnitude of the variance of each measurement to be a function of its predicted value (Wikipedia, 2019).
The distribution families currently implemented in statsmodels are Binomial, Gamma, Gaussian, InverseGaussian, NegativeBinomial, Poisson, and Tweedie.
In this exercise I will show you how to fit GLM models with the Gaussian and Poisson families.
If the outcome is continuous and unbounded, then the most "default" choice is the Gaussian distribution (a.k.a. normal distribution), i.e. standard linear regression (unless you use a link function other than the default identity link) (Source).
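In symbols (standard GLM notation, not specific to this lesson's data), the GLM relates the expected response to the linear predictor through a link function $g$:
$$ g\big(\mathrm{E}[y_i]\big) = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_p x_{ip} $$
With the Gaussian family and the identity link, $g(\mu)=\mu$, this reduces to ordinary least squares; with the Poisson family the default link is the log, so the model becomes $\log \mathrm{E}[y_i] = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_p x_{ip}$.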
glm_gaussian = sm.formula.glm('cancer ~ smoking + poverty + PM25 + NO2 + SO2',
family=sm.families.Gaussian(), data=df).fit()
print(glm_gaussian.summary())
print('Parameters: ', glm_gaussian.params)
print('T-values: ', glm_gaussian.tvalues)
y = df['cancer']
y_glm_gaussian =glm_gaussian.predict()
GLM_gaussian_pred = pd.DataFrame({'Observed': y, 'Predicted': y_glm_gaussian})
GLM_gaussian_pred.head()
plt.figure(figsize=(6, 6))
ax = sns.regplot(x="Observed", y="Predicted", data=GLM_gaussian_pred)
ax.set_ylim(30,130)
ax.set_xlim(30,130)
ax.plot([30, 130], [30, 130], 'k--', lw=2)
plt.title('GLM-Gaussian',fontsize=18)
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(y_glm_gaussian, glm_gaussian.resid_pearson)
ax.hlines(0, 30, 100)
ax.set_xlim(30, 100)
ax.set_title('Residual Dependence Plot')
ax.set_ylabel('Pearson Residuals')
ax.set_xlabel('Fitted values')
from scipy import stats
fig, ax = plt.subplots(figsize=(5,5))
resid = glm_gaussian.resid_deviance.copy() # residuals
resid_std = stats.zscore(resid)
ax.hist(resid_std, bins=25)
ax.set_title('Histogram of standardized deviance residuals');
graphics.gofplots.qqplot(resid, line='r')
If the outcome is discrete, or more precisely, you are dealing with counts (how many times something happens in a given time interval), then the most common choice of distribution to start with is the Poisson distribution. The problem with the Poisson distribution is that it is rather inflexible: it assumes that the mean equals the variance. If this assumption is not met, you may consider using the quasi-Poisson family or the negative binomial distribution (see also Definition of dispersion parameter for quasipoisson family) (Source); a sketch of the negative binomial alternative follows the Poisson fit below.
Before fitting the Poisson model we convert the cancer (float64) values to integer counts.
df['cancer_int'] = df['cancer'].astype('int64')
df.head()
glm_poisson = sm.formula.glm('cancer_int ~ smoking + poverty + PM25 + NO2 + SO2',
family=sm.families.Poisson(), data=df).fit()
print(glm_poisson.summary())
y = df['cancer']
y_glm_poisson = glm_poisson.predict()
GLM_poisson_pred = pd.DataFrame({'Observed': y, 'Predicted': y_glm_poisson})
GLM_poisson_pred.head()
plt.figure(figsize=(6, 6))
ax = sns.regplot(x="Observed", y="Predicted", data=GLM_gaussian_pred)
ax.set_ylim(30,130)
ax.set_xlim(30,130)
ax.plot([30, 130], [30, 130], 'k--', lw=2)
plt.title('GLM-Poisson',fontsize=18)
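As noted earlier, if the counts were overdispersed (variance larger than the mean), a negative binomial family could be tried instead of Poisson. Here is a minimal sketch using the same formula; alpha=1.0 is just the statsmodels default dispersion, not a value estimated from these data:
# Hedged sketch: negative binomial GLM as an alternative for overdispersed counts
glm_nb = sm.formula.glm('cancer_int ~ smoking + poverty + PM25 + NO2 + SO2',
                        family=sm.families.NegativeBinomial(alpha=1.0), data=df).fit()
print(glm_nb.summary())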
Regression is a type of supervised machine learning that is usually used to predict continuous data; it produces a model that represents the "best fit" to some observed data, according to an evaluation criterion.
The Keras library is a high-level API that runs on top of TensorFlow for building deep learning models. Often, building a very complex deep learning network with Keras can be achieved with only a few lines of code.
In this exercise I will use TensorFlow 2.0 with GPU support to develop a simple neural network regression model. For installation of TensorFlow GPU in Python 3.7, please see Jeff Heaton's very useful GitHub site. I mostly follow his class website for regression analysis with Keras.
mf=pd.read_csv("Data_05/county_mean.csv")
df=mf[["cancer",'smoking','poverty','PM25','NO2','SO2']]
df.head()
It is good practice to normalize or standardize features that use different scales and ranges. Although the model might converge without feature normalization, it makes training more difficult, and it makes the resulting model dependent on the choice of units used in the input.
df['poverty'] = zscore(df['poverty'])
df['smoking'] = zscore(df['smoking'])
df['PM25'] = zscore(df['PM25'])
df['NO2'] = zscore(df['NO2'])
df['SO2'] = zscore(df['SO2'])
df.head()
Separate the target value, or "y", from the features. This label is the value that you will train the model to predict.
x_columns=['smoking','poverty','PM25','NO2','SO2']
x = df[x_columns].values
y = df['cancer'].values
x_train, x_test, y_train, y_test = train_test_split(
x, y, test_size=0.25, random_state=42)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
model = Sequential()
model.add(Dense(25, input_dim=x.shape[1], activation='relu')) # Hidden 1
model.add(Dense(10, activation='relu')) # Hidden 2
model.add(Dense(1)) # Output
model.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3,
patience=5, verbose=1, mode='auto', restore_best_weights=True)
model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=2,epochs=1000)
DNN_pred = model.predict(x_test)
DNN_Pred_df = pd.DataFrame({'Observed': y_test.flatten(), 'Predicted': DNN_pred.flatten()})
DNN_Pred_df.head()
plt.figure(figsize=(6, 6))
ax = sns.regplot(x="Observed", y="Predicted", data=DNN_Pred_df)
ax.set_ylim(30,130)
ax.set_xlim(30,130)
ax.plot([30, 130], [30, 130], 'k--', lw=2)
plt.title('Neural Network Regression',fontsize=18)
# Measure MSE error.
score_mse = metrics.mean_squared_error(DNN_pred,y_test)
print("Final score (MSE): {}".format(score_mse))
import numpy as np
# Measure RMSE error. RMSE is common for regression.
score_rmse = np.sqrt(metrics.mean_squared_error(DNN_pred,y_test))
print("Final score (RMSE): {}".format(score_rmse))
To generate a lift chart, perform the following activities: sort the test observations by their observed (expected) value, plot the sorted expected values as one line, and plot the corresponding predictions as a second line, as done in the chart_regression function below.
Reading a lift chart: the more closely the prediction line follows the expected line, the better the model fits; large vertical gaps between the two lines indicate large prediction errors.
# Regression chart.
def chart_regression(pred, y, sort=True):
    t = pd.DataFrame({'pred': pred, 'y': y.flatten()})
    if sort:
        t.sort_values(by=['y'], inplace=True)
    plt.plot(t['y'].tolist(), label='expected')
    plt.plot(t['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()
# Plot the chart
chart_regression(DNN_pred.flatten(),y_test)