- GapMinder is a teaching project that identifies systematic misconceptions about important global trends and proportions, and uses reliable data to develop easy-to-understand teaching materials that rid people of those misconceptions.
- Compiles and makes available many useful cross-country data sources
- Free and easy to access (once you understand how)
- Lots of variables are available, from multiple sources, covering the period after 1800.
Setup¶
Import Modules and set Paths¶
# Basic Packages
from __future__ import division
import os
from datetime import datetime
# Web & file access
import requests
import io
# Import display options for showing websites
from IPython.display import IFrame, HTML
# Data
import pandas as pd
import numpy as np
from pandas_datareader import data, wb
# GIS & maps
import geopandas as gpd
gp = gpd
import georasters as gr
import geoplot as gplt
import geoplot.crs as gcrs
import mapclassify as mc
import textwrap
# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
%pylab --no-import-all
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
import plotly.express as px
import plotly.graph_objects as go
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
# Next line can import all of plotnine, but may overwrite things? Better import each function/object you need
#from plotnine import *
Using matplotlib backend: <object object at 0x152374fa0> %pylab is deprecated, use %matplotlib inline and import the required libraries. Populating the interactive namespace from numpy and matplotlib
# Data Munging
from itertools import product, combinations
import difflib
import pycountry
import geocoder
from geonamescache.mappers import country
# Lookup helpers built with geonamescache's `country` mapper factory.
# Each call maps the `from_key` field of a country record to its `to_key` field.
# Country name -> 'iso3' code
mapper = country(from_key='name', to_key='iso3')
# 'iso3' code -> 'iso' code (presumably the two-letter ISO code — TODO confirm)
mapper2 = country(from_key='iso3', to_key='iso')
# 'iso3' code -> country name
mapper3 = country(from_key='iso3', to_key='name')
# Regressions & Stats
from scipy.stats import norm
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer, LineLocation
# Paths
# Output folders for downloaded data and for generated graphs/tables.
# os.makedirs(..., exist_ok=True) creates the folder only when it is missing,
# replacing the separate "exists? then mkdir" check, which can fail if the
# folder appears between the check and the creation.
pathout = './data/'
os.makedirs(pathout, exist_ok=True)
pathgraphs = './graphs/'
os.makedirs(pathgraphs, exist_ok=True)

# Last year of data to keep: the earlier of 2020 and two years before the
# current year. It is used below to drop projected values from each series.
currentYear = datetime.now().year
year = min(2020, currentYear - 2)
Getting data from GapMinder¶
There are two ways of getting data from GapMinder:
Use GapMinder Data Website and select a series of interest and download it as a CSV or Excel file.
Download the series of interest from GapMinder's GitHub repositories:
- Systema Globalis (indicators inherited from Gapminder World, many are still updated)
- Fast Track (indicators they compile manually)
- World Development Indicators (WDI) (direct copy from World Bank repository)
Below we will access GapMinder's data via GitHub, since it is much easier and more efficient.
So, we can get data from their GitHub site.
Two approaches:
- Clone repository and process (we could process everything to create one giant database)
- Access specific files we are interested in
Here we'll follow approach 2
Let's start by getting country names, codes, etc.¶
# Country entity table from the Systema Globalis repository.
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/'
file = 'ddf--entities--geo--country.csv'
# Only empty cells count as missing (default NA markers are disabled), so
# literal text such as "NA" is kept as a string instead of becoming NaN.
countries_gm = pd.read_csv(
    f'{url}{file}',
    na_values='',
    keep_default_na=False,
    encoding='utf-8',
)
countries_gm.head()
country | g77_and_oecd_countries | income_3groups | income_groups | is--country | iso3166_1_alpha2 | iso3166_1_alpha3 | iso3166_1_numeric | iso3166_2 | landlocked | ... | name | un_sdg_ldc | un_sdg_region | un_state | unhcr_region | unicef_region | unicode_region_subtag | west_and_rest | world_4region | world_6region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | abkh | others | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN | ... | Abkhazia | NaN | NaN | False | NaN | NaN | NaN | NaN | europe | europe_central_asia |
1 | abw | others | high_income | high_income | True | AW | ABW | 533.0 | NaN | coastline | ... | Aruba | un_not_least_developed | un_latin_america_and_the_caribbean | False | unhcr_americas | NaN | AW | NaN | americas | america |
2 | afg | g77 | low_income | low_income | True | AF | AFG | 4.0 | NaN | landlocked | ... | Afghanistan | un_least_developed | un_central_and_southern_asia | True | unhcr_asia_pacific | sa | AF | rest | asia | south_asia |
3 | ago | g77 | middle_income | lower_middle_income | True | AO | AGO | 24.0 | NaN | coastline | ... | Angola | un_least_developed | un_sub_saharan_africa | True | unhcr_southern_africa | ssa | AO | rest | africa | sub_saharan_africa |
4 | aia | others | NaN | NaN | True | AI | AIA | 660.0 | NaN | coastline | ... | Anguilla | un_not_least_developed | un_latin_america_and_the_caribbean | False | unhcr_americas | NaN | AI | NaN | americas | america |
5 rows × 23 columns
Now let's get Life-Expectancy Data¶
# Life expectancy (years) by geo and time.
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--gapminder_world/master/'
file = 'ddf--datapoints--life_expectancy_years--by--geo--time.csv'
# Only empty cells are treated as missing values.
life_exp = pd.read_csv(
    f'{url}{file}',
    na_values='',
    keep_default_na=False,
    encoding='utf-8',
)
life_exp.head()
geo | life_expectancy_years | time | |
---|---|---|---|
0 | afg | 28.21 | 1800 |
1 | afg | 28.20 | 1801 |
2 | afg | 28.19 | 1802 |
3 | afg | 28.18 | 1803 |
4 | afg | 28.17 | 1804 |
Since it includes projections, let's drop values after {{year}}¶
# Keep observations up to `year` only (later rows are projections) and renumber the rows.
life_exp = life_exp[life_exp['time'] <= year].reset_index(drop=True)
life_exp.head()
geo | life_expectancy_years | time | |
---|---|---|---|
0 | afg | 28.21 | 1800 |
1 | afg | 28.20 | 1801 |
2 | afg | 28.19 | 1802 |
3 | afg | 28.18 | 1803 |
4 | afg | 28.17 | 1804 |
Let's get GDPpc¶
# GDP per capita (PPP, inflation adjusted) by geo and time.
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--gdp_per_capita_cppp/master/'
file = 'ddf--datapoints--income_per_person_gdppercapita_ppp_inflation_adjusted--by--geo--time.csv'
# Only empty cells are treated as missing values.
gdppc_gm = pd.read_csv(
    f'{url}{file}',
    na_values='',
    keep_default_na=False,
    encoding='utf-8',
)
gdppc_gm.head()
geo | time | income_per_person_gdppercapita_ppp_inflation_adjusted | |
---|---|---|---|
0 | afg | 1800 | 683 |
1 | afg | 1801 | 683 |
2 | afg | 1802 | 683 |
3 | afg | 1803 | 683 |
4 | afg | 1804 | 683 |
Since it includes projections, let's drop values after {{year}}¶
# Keep observations up to `year` only (later rows are projections) and renumber the rows.
gdppc_gm = gdppc_gm[gdppc_gm['time'] <= year].reset_index(drop=True)
gdppc_gm.head()
geo | time | income_per_person_gdppercapita_ppp_inflation_adjusted | |
---|---|---|---|
0 | afg | 1800 | 683 |
1 | afg | 1801 | 683 |
2 | afg | 1802 | 683 |
3 | afg | 1803 | 683 |
4 | afg | 1804 | 683 |
... | ... | ... | ... |
43090 | zwe | 2016 | 3678 |
43091 | zwe | 2017 | 3796 |
43092 | zwe | 2018 | 3923 |
43093 | zwe | 2019 | 3630 |
43094 | zwe | 2020 | 3374 |
43095 rows × 3 columns
Let's get TFR¶
# Total fertility rate (children per woman) by geo and time.
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/countries-etc-datapoints/'
file = 'ddf--datapoints--children_per_woman_total_fertility--by--geo--time.csv'
# Only empty cells are treated as missing values.
tfr_gm = pd.read_csv(
    f'{url}{file}',
    na_values='',
    keep_default_na=False,
    encoding='utf-8',
)
tfr_gm.head()
geo | time | children_per_woman_total_fertility | |
---|---|---|---|
0 | abw | 1800 | 5.64 |
1 | abw | 1801 | 5.64 |
2 | abw | 1802 | 5.64 |
3 | abw | 1803 | 5.64 |
4 | abw | 1804 | 5.64 |
Since it includes projections, let's drop values after {{year}}¶
# Keep observations up to `year` only (later rows are projections) and renumber the rows.
tfr_gm = tfr_gm[tfr_gm['time'] <= year].reset_index(drop=True)
tfr_gm.head()
geo | time | children_per_woman_total_fertility | |
---|---|---|---|
0 | abw | 1800 | 5.64 |
1 | abw | 1801 | 5.64 |
2 | abw | 1802 | 5.64 |
3 | abw | 1803 | 5.64 |
4 | abw | 1804 | 5.64 |
Let's get CDR¶
# Crude death rate (deaths per 1000 population) by geo and time.
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/countries-etc-datapoints/'
file = 'ddf--datapoints--crude_death_rate_deaths_per_1000_population--by--geo--time.csv'
# Load (only empty cells are missing), keep rows up to `year`, renumber the rows.
cdr_gm = (
    pd.read_csv(f'{url}{file}', na_values='', keep_default_na=False, encoding='utf-8')
    .loc[lambda frame: frame['time'] <= year]
    .reset_index(drop=True)
)
cdr_gm.head()
geo | time | crude_death_rate_deaths_per_1000_population | |
---|---|---|---|
0 | abw | 1950 | 10.383 |
1 | abw | 1951 | 10.029 |
2 | abw | 1952 | 9.394 |
3 | abw | 1953 | 8.858 |
4 | abw | 1954 | 8.331 |
Let's get CBR¶
# Crude birth rate (births per 1000 population) by geo and time.
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/countries-etc-datapoints/'
file = 'ddf--datapoints--crude_birth_rate_births_per_1000_population--by--geo--time.csv'
# Load (only empty cells are missing), keep rows up to `year`, renumber the rows.
cbr_gm = (
    pd.read_csv(f'{url}{file}', na_values='', keep_default_na=False, encoding='utf-8')
    .loc[lambda frame: frame['time'] <= year]
    .reset_index(drop=True)
)
cbr_gm.head()
geo | time | crude_birth_rate_births_per_1000_population | |
---|---|---|---|
0 | abw | 1800 | 39.51 |
1 | abw | 1801 | 39.51 |
2 | abw | 1802 | 39.51 |
3 | abw | 1803 | 39.51 |
4 | abw | 1804 | 39.51 |
Let's get Contraception use¶
# Contraceptive use (% of women ages 15-49) by geo and time.
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/countries-etc-datapoints/'
file = 'ddf--datapoints--contraceptive_use_percent_of_women_ages_15_49--by--geo--time.csv'
# Load (only empty cells are missing), keep rows up to `year`, renumber the rows.
contraception_gm = (
    pd.read_csv(f'{url}{file}', na_values='', keep_default_na=False, encoding='utf-8')
    .loc[lambda frame: frame['time'] <= year]
    .reset_index(drop=True)
)
contraception_gm.head()
geo | time | contraceptive_use_percent_of_women_ages_15_49 | |
---|---|---|---|
0 | afg | 2000 | 5.3 |
1 | afg | 2003 | 10.3 |
2 | afg | 2005 | 13.6 |
3 | afg | 2006 | 18.6 |
4 | afg | 2008 | 22.8 |
Let's get Food Supply¶
# Food supply (kilocalories per person and day) by geo and time.
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/countries-etc-datapoints/'
file = 'ddf--datapoints--food_supply_kilocalories_per_person_and_day--by--geo--time.csv'
# Load (only empty cells are missing), keep rows up to `year`, renumber the rows.
food_gm = (
    pd.read_csv(f'{url}{file}', na_values='', keep_default_na=False, encoding='utf-8')
    .loc[lambda frame: frame['time'] <= year]
    .reset_index(drop=True)
)
food_gm.head()
geo | time | food_supply_kilocalories_per_person_and_day | |
---|---|---|---|
0 | afg | 1961 | 2999 |
1 | afg | 1962 | 2917 |
2 | afg | 1963 | 2698 |
3 | afg | 1964 | 2953 |
4 | afg | 1965 | 2956 |
Let's get GDP per worker¶
# GDP per employee (US$, inflation adjusted) by geo and time.
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/countries-etc-datapoints/'
file = 'ddf--datapoints--gdpperemployee_us_inflation_adjusted--by--geo--time.csv'
# Only empty cells are treated as missing values.
gdppc_pw_gm = pd.read_csv(url + file,
                          encoding='utf-8', keep_default_na=False, na_values='')
# Drop values after `year` using THIS table's own `time` column.
# The mask was previously built from `cbr_gm.time`; pandas aligns a boolean
# mask on the row index, so rows were kept or dropped according to the years
# of unrelated rows in the birth-rate table.
gdppc_pw_gm = gdppc_pw_gm.loc[gdppc_pw_gm.time <= year].reset_index(drop=True)
gdppc_pw_gm.head()
geo | time | gdpperemployee_us_inflation_adjusted | |
---|---|---|---|
0 | afg | 1991 | 2393.89 |
1 | afg | 1992 | 2226.67 |
2 | afg | 1993 | 1529.35 |
3 | afg | 1994 | 1100.26 |
4 | afg | 1995 | 1550.83 |
Merge¶
# Attach country attributes to every life-expectancy observation
# (right join: all rows of `life_exp` are kept).
df = countries_gm.merge(life_exp, how='right', left_on='country', right_on='geo')
print(df.shape)
# Inner join: keep only geo/time pairs that also have GDP per capita.
df = df.merge(gdppc_gm, how='inner', on=['geo', 'time'])
print(df.shape)
# The remaining indicators are added with left joins, so rows without a match
# are kept and the new column is left missing.
df = (
    df.merge(tfr_gm, how='left', on=['geo', 'time'])
    .merge(cbr_gm, how='left', on=['geo', 'time'])
    .merge(cdr_gm, how='left', on=['geo', 'time'])
    .merge(contraception_gm, how='left', on=['geo', 'time'])
    .merge(food_gm, how='left', on=['geo', 'time'])
    .merge(gdppc_pw_gm, how='left', on=['geo', 'time'])
)
# Copy of `time` under the name `year`.
df['year'] = df['time']
df
(43444, 26) (40303, 27)
country | g77_and_oecd_countries | income_3groups | income_groups | is--country | iso3166_1_alpha2 | iso3166_1_alpha3 | iso3166_1_numeric | iso3166_2 | landlocked | ... | life_expectancy_years | time | income_per_person_gdppercapita_ppp_inflation_adjusted | children_per_woman_total_fertility | crude_birth_rate_births_per_1000_population | crude_death_rate_deaths_per_1000_population | contraceptive_use_percent_of_women_ages_15_49 | food_supply_kilocalories_per_person_and_day | gdpperemployee_us_inflation_adjusted | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | afg | g77 | low_income | low_income | True | AF | AFG | 4.0 | NaN | landlocked | ... | 28.21 | 1800 | 683 | 7.00 | 48.14 | NaN | NaN | NaN | NaN | 1800 |
1 | afg | g77 | low_income | low_income | True | AF | AFG | 4.0 | NaN | landlocked | ... | 28.20 | 1801 | 683 | 7.00 | 48.14 | NaN | NaN | NaN | NaN | 1801 |
2 | afg | g77 | low_income | low_income | True | AF | AFG | 4.0 | NaN | landlocked | ... | 28.19 | 1802 | 683 | 7.00 | 48.14 | NaN | NaN | NaN | NaN | 1802 |
3 | afg | g77 | low_income | low_income | True | AF | AFG | 4.0 | NaN | landlocked | ... | 28.18 | 1803 | 683 | 7.00 | 48.14 | NaN | NaN | NaN | NaN | 1803 |
4 | afg | g77 | low_income | low_income | True | AF | AFG | 4.0 | NaN | landlocked | ... | 28.17 | 1804 | 683 | 7.00 | 48.14 | NaN | NaN | NaN | NaN | 1804 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
40298 | ssd | NaN | low_income | low_income | True | SS | SSD | 728.0 | NaN | landlocked | ... | 56.70 | 2011 | 4190 | 5.29 | 37.91 | 11.600 | NaN | NaN | 2139.73 | 2011 |
40299 | ssd | NaN | low_income | low_income | True | SS | SSD | 728.0 | NaN | landlocked | ... | 56.80 | 2012 | 2196 | 5.20 | 37.51 | 11.104 | NaN | NaN | 2047.46 | 2012 |
40300 | ssd | NaN | low_income | low_income | True | SS | SSD | 728.0 | NaN | landlocked | ... | 57.20 | 2013 | 2426 | 5.11 | 37.13 | 11.136 | NaN | NaN | 2258.47 | 2013 |
40301 | ssd | NaN | low_income | low_income | True | SS | SSD | 728.0 | NaN | landlocked | ... | 57.60 | 2014 | 2461 | 5.02 | 36.76 | 11.493 | NaN | NaN | 2282.65 | 2014 |
40302 | ssd | NaN | low_income | low_income | True | SS | SSD | 728.0 | NaN | landlocked | ... | 58.00 | 2015 | 2162 | 4.94 | 36.40 | 11.072 | NaN | NaN | 1998.36 | 2015 |
40303 rows × 34 columns
Let's get country groups etc from WDI as before¶
Steps¶
# World Bank country metadata, excluding entries whose region is 'Aggregates'.
wbcountries = wb.get_countries()
wbcountries = wbcountries[~wbcountries['region'].isin(['Aggregates'])].reset_index(drop=True)
# Tidy the text fields: strip whitespace around names, title-case income levels.
wbcountries['name'] = wbcountries['name'].str.strip()
wbcountries['incomeLevel'] = wbcountries['incomeLevel'].str.title()
# Manual override of Venezuela's income level.
# NOTE(review): presumably the World Bank leaves it unclassified — confirm.
wbcountries.loc[wbcountries['iso3c'] == 'VEN', 'incomeLevel'] = 'Upper Middle Income'
# GapMinder country codes are lower case; upper-case them to build the join key.
df['iso3c'] = df['country'].str.upper()
# Columns present in both tables keep the World Bank name; the GapMinder copy gets '_GM'.
wdi = wbcountries.merge(df, on='iso3c', suffixes=('', '_GM'))
wdi.head()
iso3c | iso2c | name | region | adminregion | incomeLevel | lendingType | capitalCity | longitude | latitude | ... | life_expectancy_years | time | income_per_person_gdppercapita_ppp_inflation_adjusted | children_per_woman_total_fertility | crude_birth_rate_births_per_1000_population | crude_death_rate_deaths_per_1000_population | contraceptive_use_percent_of_women_ages_15_49 | food_supply_kilocalories_per_person_and_day | gdpperemployee_us_inflation_adjusted | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AFG | AF | Afghanistan | South Asia | South Asia | Low Income | IDA | Kabul | 69.1761 | 34.5228 | ... | 28.21 | 1800 | 683 | 7.0 | 48.14 | NaN | NaN | NaN | NaN | 1800 |
1 | AFG | AF | Afghanistan | South Asia | South Asia | Low Income | IDA | Kabul | 69.1761 | 34.5228 | ... | 28.20 | 1801 | 683 | 7.0 | 48.14 | NaN | NaN | NaN | NaN | 1801 |
2 | AFG | AF | Afghanistan | South Asia | South Asia | Low Income | IDA | Kabul | 69.1761 | 34.5228 | ... | 28.19 | 1802 | 683 | 7.0 | 48.14 | NaN | NaN | NaN | NaN | 1802 |
3 | AFG | AF | Afghanistan | South Asia | South Asia | Low Income | IDA | Kabul | 69.1761 | 34.5228 | ... | 28.18 | 1803 | 683 | 7.0 | 48.14 | NaN | NaN | NaN | NaN | 1803 |
4 | AFG | AF | Afghanistan | South Asia | South Asia | Low Income | IDA | Kabul | 69.1761 | 34.5228 | ... | 28.17 | 1804 | 683 | 7.0 | 48.14 | NaN | NaN | NaN | NaN | 1804 |
5 rows × 44 columns
Regression Analysis with statsmodels¶
# Embed the statsmodels documentation home page in the notebook.
url = 'https://www.statsmodels.org/stable/index.html'
IFrame(src=url, width=800, height=400)
Linear Regressions using OLS¶
It is very easy to run a regression in statsmodels.
We only need
- Data in a pandas dataframe
- An equation we want to estimate
Equations are strings of the form
'dependent_variable ~ indep_var_1 + function(indep_var2) + C(indep_var3)'
where:
- dependent_variable is the outcome variable of interest
- indep_var_1 is the first independent variable
- function(indep_var2) is a function of another independent variable (if needed)
- C(indep_var3) defines fixed-effects/dummies based on categories given in indep_var3
Simple Regression of Log[Life Expectancy] and Log[GDP pc]¶
# Short aliases for the long GapMinder column names, and their logs.
wdi['ln_life_exp'] = wdi['life_expectancy_years'].apply(np.log)
wdi['ln_gdp_pc'] = wdi['income_per_person_gdppercapita_ppp_inflation_adjusted'].apply(np.log)
wdi['tfr'] = wdi['children_per_woman_total_fertility']
wdi['life_exp'] = wdi['life_expectancy_years']
wdi['gdp_pc'] = wdi['income_per_person_gdppercapita_ppp_inflation_adjusted']
# From here on, `year` is the latest year present in the merged data.
year = wdi['year'].max()
yvar, xvar, zvar = 'ln_life_exp', 'ln_gdp_pc', 'tfr'
# Cross-section for that year: complete cases on the three variables, ordered by region.
dffig = (
    wdi[wdi['year'] == year]
    .dropna(subset=[xvar, yvar, zvar])
    .sort_values(by='region')
    .reset_index(drop=True)
)
dffig.head()
iso3c | iso2c | name | region | adminregion | incomeLevel | lendingType | capitalCity | longitude | latitude | ... | crude_death_rate_deaths_per_1000_population | contraceptive_use_percent_of_women_ages_15_49 | food_supply_kilocalories_per_person_and_day | gdpperemployee_us_inflation_adjusted | year | ln_life_exp | ln_gdp_pc | tfr | life_exp | gdp_pc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | PRK | KP | Korea, Dem. People's Rep. | East Asia & Pacific | East Asia & Pacific (excluding high income) | Low Income | Not classified | Pyongyang | 125.754 | 39.03190 | ... | 8.088 | NaN | 2061.0 | 1110.23 | 2015 | 4.268298 | 7.625107 | 1.92 | 71.4 | 2049 |
1 | FSM | FM | Micronesia, Fed. Sts. | East Asia & Pacific | East Asia & Pacific (excluding high income) | Lower Middle Income | IDA | Palikir | 158.185 | 6.91771 | ... | 5.008 | NaN | NaN | NaN | 2015 | 4.204693 | 8.145550 | 3.19 | 67.0 | 3448 |
2 | MNG | MN | Mongolia | East Asia & Pacific | East Asia & Pacific (excluding high income) | Lower Middle Income | IBRD | Ulaanbaatar | 106.937 | 47.91290 | ... | 6.158 | NaN | 2483.0 | 9538.68 | 2015 | 4.178992 | 9.306559 | 2.79 | 65.3 | 11010 |
3 | THA | TH | Thailand | East Asia & Pacific | East Asia & Pacific (excluding high income) | Upper Middle Income | IBRD | Bangkok | 100.521 | 13.73080 | ... | 6.724 | NaN | 2782.0 | 10198.24 | 2015 | 4.318821 | 9.698000 | 1.50 | 75.1 | 16285 |
4 | LAO | LA | Lao PDR | East Asia & Pacific | East Asia & Pacific (excluding high income) | Lower Middle Income | IDA | Vientiane | 102.177 | 18.58260 | ... | 7.513 | NaN | 2738.0 | 2981.26 | 2015 | 4.195697 | 8.786304 | 2.76 | 66.4 | 6544 |
5 rows × 49 columns
# Model 1: OLS of log life expectancy on log GDP per capita.
# Rows with missing values are dropped before estimation.
mod = smf.ols('ln_life_exp ~ ln_gdp_pc', dffig, missing='drop').fit()
mod.summary2()
Model: | OLS | Adj. R-squared: | 0.647 |
Dependent Variable: | ln_life_exp | AIC: | -470.7658 |
Date: | 2024-02-22 09:01 | BIC: | -464.3359 |
No. Observations: | 184 | Log-Likelihood: | 237.38 |
Df Model: | 1 | F-statistic: | 336.1 |
Df Residuals: | 182 | Prob (F-statistic): | 3.29e-43 |
R-squared: | 0.649 | Scale: | 0.0044842 |
Coef. | Std.Err. | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 3.5419 | 0.0398 | 89.0717 | 0.0000 | 3.4635 | 3.6204 |
ln_gdp_pc | 0.0781 | 0.0043 | 18.3335 | 0.0000 | 0.0697 | 0.0865 |
Omnibus: | 61.936 | Durbin-Watson: | 1.817 |
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 167.770 |
Skew: | -1.422 | Prob(JB): | 0.000 |
Kurtosis: | 6.714 | Condition No.: | 76 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Plot Data and OLS Regression Predictions¶
# In-sample predictions with the confidence interval of the fitted mean.
# summary_frame() is computed once and both bounds are read from it.
pred_ols = mod.get_prediction()
pred_frame = pred_ols.summary_frame()
iv_l = pred_frame["mean_ci_lower"]
iv_u = pred_frame["mean_ci_upper"]

# dffig is ordered by region, not by x. Drawing the dashed lines in row order
# makes them zig-zag across the plot, so the lines are drawn in increasing x.
order = np.argsort(dffig[xvar].to_numpy())
x_sorted = dffig[xvar].to_numpy()[order]

fig, ax = plt.subplots(figsize=(8, 6))
# Raw observations: marker-only, so their order does not matter.
ax.plot(dffig[xvar], dffig[yvar], "o", label="data")
# Fitted line and the lower/upper confidence bounds.
ax.plot(x_sorted, mod.fittedvalues.to_numpy()[order], "r--.", label="OLS")
ax.plot(x_sorted, iv_u.to_numpy()[order], "r--")
ax.plot(x_sorted, iv_l.to_numpy()[order], "r--")
ax.legend(loc="best")
<matplotlib.legend.Legend at 0x163c25430>
# Show the OLS figure as the cell output.
fig
Simple Regression of Log[Life Expectancy] and Log[GDP pc], accounting for WB region dummies¶
# Model 2: adds World Bank region dummies via C(region).
# Rows with missing values are dropped before estimation.
mod2 = smf.ols('ln_life_exp ~ ln_gdp_pc + C(region)', dffig, missing='drop').fit()
mod2.summary2()
Model: | OLS | Adj. R-squared: | 0.745 |
Dependent Variable: | ln_life_exp | AIC: | -524.8326 |
Date: | 2024-02-22 09:01 | BIC: | -499.1131 |
No. Observations: | 184 | Log-Likelihood: | 270.42 |
Df Model: | 7 | F-statistic: | 77.35 |
Df Residuals: | 176 | Prob (F-statistic): | 2.26e-50 |
R-squared: | 0.755 | Scale: | 0.0032382 |
Coef. | Std.Err. | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 3.7983 | 0.0474 | 80.1247 | 0.0000 | 3.7047 | 3.8919 |
C(region)[T.Europe & Central Asia] | 0.0208 | 0.0147 | 1.4109 | 0.1600 | -0.0083 | 0.0499 |
C(region)[T.Latin America & Caribbean ] | 0.0151 | 0.0152 | 0.9891 | 0.3240 | -0.0150 | 0.0451 |
C(region)[T.Middle East & North Africa] | 0.0294 | 0.0169 | 1.7354 | 0.0844 | -0.0040 | 0.0627 |
C(region)[T.North America] | 0.0255 | 0.0426 | 0.5985 | 0.5503 | -0.0586 | 0.1097 |
C(region)[T.South Asia] | -0.0052 | 0.0231 | -0.2255 | 0.8219 | -0.0509 | 0.0405 |
C(region)[T.Sub-Saharan Africa ] | -0.0911 | 0.0149 | -6.1297 | 0.0000 | -0.1205 | -0.0618 |
ln_gdp_pc | 0.0518 | 0.0050 | 10.2953 | 0.0000 | 0.0419 | 0.0617 |
Omnibus: | 38.972 | Durbin-Watson: | 2.290 |
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 79.400 |
Skew: | -0.986 | Prob(JB): | 0.000 |
Kurtosis: | 5.543 | Condition No.: | 111 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Simple Regression of Log[Life Expectancy] and Log[GDP pc] and TFR, accounting for WB region dummies¶
# Model 3: adds the total fertility rate on top of the region dummies.
# Rows with missing values are dropped before estimation.
mod3 = smf.ols('ln_life_exp ~ ln_gdp_pc + tfr + C(region)', dffig, missing='drop').fit()
mod3.summary2()
Model: | OLS | Adj. R-squared: | 0.754 |
Dependent Variable: | ln_life_exp | AIC: | -530.3147 |
Date: | 2024-02-22 09:01 | BIC: | -501.3803 |
No. Observations: | 184 | Log-Likelihood: | 274.16 |
Df Model: | 8 | F-statistic: | 71.00 |
Df Residuals: | 175 | Prob (F-statistic): | 6.11e-51 |
R-squared: | 0.764 | Scale: | 0.0031269 |
Coef. | Std.Err. | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 3.9325 | 0.0682 | 57.6766 | 0.0000 | 3.7979 | 4.0670 |
C(region)[T.Europe & Central Asia] | 0.0161 | 0.0146 | 1.1047 | 0.2708 | -0.0127 | 0.0449 |
C(region)[T.Latin America & Caribbean ] | 0.0115 | 0.0150 | 0.7662 | 0.4446 | -0.0181 | 0.0412 |
C(region)[T.Middle East & North Africa] | 0.0355 | 0.0168 | 2.1151 | 0.0358 | 0.0024 | 0.0686 |
C(region)[T.North America] | 0.0279 | 0.0419 | 0.6669 | 0.5057 | -0.0548 | 0.1107 |
C(region)[T.South Asia] | -0.0095 | 0.0228 | -0.4171 | 0.6771 | -0.0545 | 0.0355 |
C(region)[T.Sub-Saharan Africa ] | -0.0687 | 0.0168 | -4.0911 | 0.0001 | -0.1019 | -0.0356 |
ln_gdp_pc | 0.0419 | 0.0061 | 6.8217 | 0.0000 | 0.0298 | 0.0541 |
tfr | -0.0168 | 0.0062 | -2.6950 | 0.0077 | -0.0290 | -0.0045 |
Omnibus: | 56.848 | Durbin-Watson: | 2.341 |
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 168.323 |
Skew: | -1.255 | Prob(JB): | 0.000 |
Kurtosis: | 6.956 | Condition No.: | 164 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Producing a nice table with stargazer¶
# Embed the stargazer examples notebook.
url = 'https://nbviewer.org/github/mwburke/stargazer/blob/master/examples.ipynb'
IFrame(src=url, width=800, height=400)
Add the estimated models to Stargazer¶
# Put the three fitted models side by side in one regression table.
stargazer = Stargazer([mod, mod2, mod3])
stargazer.significant_digits(2)
stargazer.show_degrees_of_freedom(False)
# Optional: stargazer.dep_var_name = ''
stargazer.dependent_variable = f' Log[Life Expectancy ({year})]'
stargazer.custom_columns(['Simple', 'WB Regs', 'TFR'], [1, 1, 1])
# Optional: stargazer.show_model_numbers(False)
# Readable labels for the two covariates that are reported.
stargazer.rename_covariates({
    'ln_gdp_pc': f' Log[GDP per capita ({year})]',
    'tfr': f'Total Fertility Rate ({year})',
})
# The region dummies are summarised by a Yes/No footer line.
stargazer.add_line('WB Region FE', ['No', 'Yes', 'Yes'], LineLocation.FOOTER_TOP)
stargazer.covariate_order(['ln_gdp_pc', 'tfr'])
stargazer.cov_spacing = 2
stargazer
Dependent variable: Log[Life Expectancy (2015)] | |||
Simple | WB Regs | TFR | |
(1) | (2) | (3) | |
Log[GDP per capita (2015)] | 0.08*** | 0.05*** | 0.04*** |
(0.00) | (0.01) | (0.01) | |
Total Fertility Rate (2015) | -0.02*** | ||
(0.01) | |||
WB Region FE | No | Yes | Yes |
Observations | 184 | 184 | 184 |
R2 | 0.65 | 0.75 | 0.76 |
Adjusted R2 | 0.65 | 0.74 | 0.75 |
Residual Std. Error | 0.07 | 0.06 | 0.06 |
F Statistic | 336.12*** | 77.35*** | 71.00*** |
Note: | *p<0.1; **p<0.05; ***p<0.01 |
To show the table¶
HTML(stargazer.render_html())
# Render the table's HTML string and display it as the cell output.
HTML(stargazer.render_html())
Dependent variable: Log[Life Expectancy (2015)] | |||
Simple | WB Regs | TFR | |
(1) | (2) | (3) | |
Log[GDP per capita (2015)] | 0.08*** | 0.05*** | 0.04*** |
(0.00) | (0.01) | (0.01) | |
Total Fertility Rate (2015) | -0.02*** | ||
(0.01) | |||
WB Region FE | No | Yes | Yes |
Observations | 184 | 184 | 184 |
R2 | 0.65 | 0.75 | 0.76 |
Adjusted R2 | 0.65 | 0.74 | 0.75 |
Residual Std. Error | 0.07 | 0.06 | 0.06 |
F Statistic | 336.12*** | 77.35*** | 71.00*** |
Note: | *p<0.1; **p<0.05; ***p<0.01 |
To export the table to another file¶
file_name = "gapminder_table.html"  # Include directory path if needed
# Mode "w" overwrites an existing file. The with-block closes the handle even
# if rendering or writing raises, which the manual open()/close() did not.
with open(pathgraphs + file_name, "w") as html_file:
    html_file.write(stargazer.render_html())
# Show the hosted copy of the exported table.
# The previous first assignment pointed at 'table.html', a file that is never
# written, and was overwritten on the next line, so it has been removed.
# To preview the local export instead: url = pathgraphs + 'gapminder_table.html'
url = 'https://smu-econ-growth.github.io/EconGrowthUG-Slides-Working-with-GapMinder/gapminder_table.html'
IFrame(url, width=500, height=400)
Plotting GapMinder data¶
Many options¶
- Since the data is a pandas dataframe, we could just use its functions as we did previously
- Use the seaborn package
- Use the plotly package
- Use the plotnine package
Plots with seaborn¶
# Embed the seaborn example gallery.
url = 'https://seaborn.pydata.org/examples/index.html'
IFrame(src=url, width=800, height=400)
Let's create a scatterplot with varying point sizes and hues that plots Log[GDP per capita] against Log[Life Expectancy] for each country, using its total fertility rate as the point size and its WB region as the hue and marker style, all in the last available year.
# Figure defaults for the seaborn plots.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.set_context("talk")
# Log GDP per capita vs. log life expectancy. Region sets both colour and
# marker (in alphabetical region order); TFR sets the point size.
g = sns.relplot(
    data=dffig,
    x="ln_gdp_pc",
    y="ln_life_exp",
    hue="region",
    hue_order=dffig.region.drop_duplicates().sort_values(),
    style="region",
    style_order=dffig.region.drop_duplicates().sort_values(),
    size="tfr",
    sizes=(10, 400),
    palette="muted",
    alpha=.5,
    height=6,
    aspect=2,
)
g.set_axis_labels(f'Log[GDP per capita ({year})]', f'Log[Life Expectancy ({year})]')
<seaborn.axisgrid.FacetGrid at 0x15cfbf820>
# Show the figure behind the FacetGrid as the cell output.
g.fig
Using scatterplot¶
# Same chart as the relplot version, drawn on an explicit matplotlib Axes.
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
fig, ax = plt.subplots()
# Region sets both colour and marker (in alphabetical region order);
# TFR sets the point size.
sns.scatterplot(x="ln_gdp_pc",
                y="ln_life_exp",
                data=dffig,
                hue="region",
                hue_order = dffig.region.drop_duplicates().sort_values(),
                style="region",
                style_order = dffig.region.drop_duplicates().sort_values(),
                size="tfr",
                sizes=(10, 400),
                alpha=.5,
                palette="muted",
                ax=ax
                )
# Labels now match the plotted variables: x is ln_gdp_pc and y is ln_life_exp.
# They were previously 'Latitude' and 'Log[GDP per capita ...]'.
ax.set_xlabel('Log[GDP per capita (' + str(year) + ')]')
ax.set_ylabel('Log[Life Expectancy (' + str(year) + ')]')
ax.legend(fontsize=10)
<matplotlib.legend.Legend at 0x15cd69640>
# Show the scatterplot figure as the cell output.
fig