- GapMinder is a teaching project that identifies systematic misconceptions about important global trends and proportions and uses reliable data to develop easy to understand teaching materials to rid people of their misconceptions.
- Compiles and makes available many useful cross-country data sources
- Free and easy to access (once you understand how)
- Lot's of variables are available, from multiple sources covering the period after 1800.
Setup¶
Import Modules and set Paths¶
# Basic Packages
from __future__ import division
import os
from datetime import datetime
# Web & file access
import requests
import io
# Import display options for showing websites
from IPython.display import IFrame, HTML
# Data
import pandas as pd
import numpy as np
from pandas_datareader import data, wb
# GIS & maps
import geopandas as gpd
gp = gpd
import georasters as gr
import geoplot as gplt
import geoplot.crs as gcrs
import mapclassify as mc
import textwrap
# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
%pylab --no-import-all
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
import plotly.express as px
import plotly.graph_objects as go
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
# Next line can import all of plotnine, but may overwrite things? Better import each function/object you need
#from plotnine import *
Using matplotlib backend: <object object at 0x152374fa0> %pylab is deprecated, use %matplotlib inline and import the required libraries. Populating the interactive namespace from numpy and matplotlib
# Data Munging
from itertools import product, combinations
import difflib
import pycountry
import geocoder
from geonamescache.mappers import country
mapper = country(from_key='name', to_key='iso3')
mapper2 = country(from_key='iso3', to_key='iso')
mapper3 = country(from_key='iso3', to_key='name')
# Regressions & Stats
from scipy.stats import norm
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer, LineLocation
# Paths
pathout = './data/'
if not os.path.exists(pathout):
os.mkdir(pathout)
pathgraphs = './graphs/'
if not os.path.exists(pathgraphs):
os.mkdir(pathgraphs)
currentYear = datetime.now().year
year = min(2020, currentYear-2)
Getting data from GapMinder¶
There are two ways of getting data from GapMinder:
Use GapMinder Data Website and select a series of interest and download it as a CSV or Excel file.
Download the series of interest from GapMinder's Github reporsitories:
- Systema Globalis (indicators inherited from Gapminder World, many are still updated)
- Fast Track (indicators they compile manually)
- World Development Indicators (WDI) (direct copy from World Bank repository)
Below we will access GapMinder's data via Github, since it is much easier and efficient.
So, we can get data from their GitHub site.
Two approaches:
- Clone repository and process (we could process everything to create one giant database)
- Access specific files we are interested in
Here we'll follow approach 2
Let's start by getting country names, codes, etc.¶
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/'
file = 'ddf--entities--geo--country.csv'
countries_gm = pd.read_csv(url + file,
encoding='utf-8', keep_default_na=False, na_values='')
countries_gm.head()
country | g77_and_oecd_countries | income_3groups | income_groups | is--country | iso3166_1_alpha2 | iso3166_1_alpha3 | iso3166_1_numeric | iso3166_2 | landlocked | ... | name | un_sdg_ldc | un_sdg_region | un_state | unhcr_region | unicef_region | unicode_region_subtag | west_and_rest | world_4region | world_6region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | abkh | others | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN | ... | Abkhazia | NaN | NaN | False | NaN | NaN | NaN | NaN | europe | europe_central_asia |
1 | abw | others | high_income | high_income | True | AW | ABW | 533.0 | NaN | coastline | ... | Aruba | un_not_least_developed | un_latin_america_and_the_caribbean | False | unhcr_americas | NaN | AW | NaN | americas | america |
2 | afg | g77 | low_income | low_income | True | AF | AFG | 4.0 | NaN | landlocked | ... | Afghanistan | un_least_developed | un_central_and_southern_asia | True | unhcr_asia_pacific | sa | AF | rest | asia | south_asia |
3 | ago | g77 | middle_income | lower_middle_income | True | AO | AGO | 24.0 | NaN | coastline | ... | Angola | un_least_developed | un_sub_saharan_africa | True | unhcr_southern_africa | ssa | AO | rest | africa | sub_saharan_africa |
4 | aia | others | NaN | NaN | True | AI | AIA | 660.0 | NaN | coastline | ... | Anguilla | un_not_least_developed | un_latin_america_and_the_caribbean | False | unhcr_americas | NaN | AI | NaN | americas | america |
5 rows × 23 columns
Now let's get Life-Expectancy Data¶
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--gapminder_world/master/'
file = 'ddf--datapoints--life_expectancy_years--by--geo--time.csv'
life_exp = pd.read_csv(url + file,
encoding='utf-8', keep_default_na=False, na_values='')
life_exp.head()
geo | life_expectancy_years | time | |
---|---|---|---|
0 | afg | 28.21 | 1800 |
1 | afg | 28.20 | 1801 |
2 | afg | 28.19 | 1802 |
3 | afg | 28.18 | 1803 |
4 | afg | 28.17 | 1804 |
Since it includes projections, let's drop values after {{year}}¶
life_exp = life_exp.loc[life_exp.time<=year].reset_index(drop=True)
life_exp.head()
geo | life_expectancy_years | time | |
---|---|---|---|
0 | afg | 28.21 | 1800 |
1 | afg | 28.20 | 1801 |
2 | afg | 28.19 | 1802 |
3 | afg | 28.18 | 1803 |
4 | afg | 28.17 | 1804 |
Let's get GDPpc¶
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--gdp_per_capita_cppp/master/'
file = 'ddf--datapoints--income_per_person_gdppercapita_ppp_inflation_adjusted--by--geo--time.csv'
gdppc_gm = pd.read_csv(url + file,
encoding='utf-8', keep_default_na=False, na_values='')
gdppc_gm.head()
geo | time | income_per_person_gdppercapita_ppp_inflation_adjusted | |
---|---|---|---|
0 | afg | 1800 | 683 |
1 | afg | 1801 | 683 |
2 | afg | 1802 | 683 |
3 | afg | 1803 | 683 |
4 | afg | 1804 | 683 |
Since it includes projections, let's drop values after {{year}}¶
gdppc_gm = gdppc_gm.loc[gdppc_gm.time<=year].reset_index(drop=True)
gdppc_gm.head()
geo | time | income_per_person_gdppercapita_ppp_inflation_adjusted | |
---|---|---|---|
0 | afg | 1800 | 683 |
1 | afg | 1801 | 683 |
2 | afg | 1802 | 683 |
3 | afg | 1803 | 683 |
4 | afg | 1804 | 683 |
... | ... | ... | ... |
43090 | zwe | 2016 | 3678 |
43091 | zwe | 2017 | 3796 |
43092 | zwe | 2018 | 3923 |
43093 | zwe | 2019 | 3630 |
43094 | zwe | 2020 | 3374 |
43095 rows × 3 columns
Let's get TFR¶
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/countries-etc-datapoints/'
file = 'ddf--datapoints--children_per_woman_total_fertility--by--geo--time.csv'
tfr_gm = pd.read_csv(url + file,
encoding='utf-8', keep_default_na=False, na_values='')
tfr_gm.head()
geo | time | children_per_woman_total_fertility | |
---|---|---|---|
0 | abw | 1800 | 5.64 |
1 | abw | 1801 | 5.64 |
2 | abw | 1802 | 5.64 |
3 | abw | 1803 | 5.64 |
4 | abw | 1804 | 5.64 |
Since it includes projections, let's drop values after {{year}}¶
tfr_gm = tfr_gm.loc[tfr_gm.time<=year].reset_index(drop=True)
tfr_gm.head()
geo | time | children_per_woman_total_fertility | |
---|---|---|---|
0 | abw | 1800 | 5.64 |
1 | abw | 1801 | 5.64 |
2 | abw | 1802 | 5.64 |
3 | abw | 1803 | 5.64 |
4 | abw | 1804 | 5.64 |
Let's get CDR¶
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/countries-etc-datapoints/'
file = 'ddf--datapoints--crude_death_rate_deaths_per_1000_population--by--geo--time.csv'
cdr_gm = pd.read_csv(url + file,
encoding='utf-8', keep_default_na=False, na_values='')
cdr_gm = cdr_gm.loc[cdr_gm.time<=year].reset_index(drop=True)
cdr_gm.head()
geo | time | crude_death_rate_deaths_per_1000_population | |
---|---|---|---|
0 | abw | 1950 | 10.383 |
1 | abw | 1951 | 10.029 |
2 | abw | 1952 | 9.394 |
3 | abw | 1953 | 8.858 |
4 | abw | 1954 | 8.331 |
Let's get CBR¶
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/countries-etc-datapoints/'
file = 'ddf--datapoints--crude_birth_rate_births_per_1000_population--by--geo--time.csv'
cbr_gm = pd.read_csv(url + file,
encoding='utf-8', keep_default_na=False, na_values='')
cbr_gm = cbr_gm.loc[cbr_gm.time<=year].reset_index(drop=True)
cbr_gm.head()
geo | time | crude_birth_rate_births_per_1000_population | |
---|---|---|---|
0 | abw | 1800 | 39.51 |
1 | abw | 1801 | 39.51 |
2 | abw | 1802 | 39.51 |
3 | abw | 1803 | 39.51 |
4 | abw | 1804 | 39.51 |
Let's get Contraception use¶
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/countries-etc-datapoints/'
file = 'ddf--datapoints--contraceptive_use_percent_of_women_ages_15_49--by--geo--time.csv'
contraception_gm = pd.read_csv(url + file,
encoding='utf-8', keep_default_na=False, na_values='')
contraception_gm = contraception_gm.loc[contraception_gm.time<=year].reset_index(drop=True)
contraception_gm.head()
geo | time | contraceptive_use_percent_of_women_ages_15_49 | |
---|---|---|---|
0 | afg | 2000 | 5.3 |
1 | afg | 2003 | 10.3 |
2 | afg | 2005 | 13.6 |
3 | afg | 2006 | 18.6 |
4 | afg | 2008 | 22.8 |
Let's get Food Supply¶
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/countries-etc-datapoints/'
file = 'ddf--datapoints--food_supply_kilocalories_per_person_and_day--by--geo--time.csv'
food_gm = pd.read_csv(url + file,
encoding='utf-8', keep_default_na=False, na_values='')
food_gm = food_gm.loc[food_gm.time<=year].reset_index(drop=True)
food_gm.head()
geo | time | food_supply_kilocalories_per_person_and_day | |
---|---|---|---|
0 | afg | 1961 | 2999 |
1 | afg | 1962 | 2917 |
2 | afg | 1963 | 2698 |
3 | afg | 1964 | 2953 |
4 | afg | 1965 | 2956 |
Let's get GDP per worker¶
url = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/master/countries-etc-datapoints/'
file = 'ddf--datapoints--gdpperemployee_us_inflation_adjusted--by--geo--time.csv'
gdppc_pw_gm = pd.read_csv(url + file,
encoding='utf-8', keep_default_na=False, na_values='')
gdppc_pw_gm = gdppc_pw_gm.loc[cbr_gm.time<=year].reset_index(drop=True)
gdppc_pw_gm.head()
geo | time | gdpperemployee_us_inflation_adjusted | |
---|---|---|---|
0 | afg | 1991 | 2393.89 |
1 | afg | 1992 | 2226.67 |
2 | afg | 1993 | 1529.35 |
3 | afg | 1994 | 1100.26 |
4 | afg | 1995 | 1550.83 |
Merge¶
df = countries_gm.merge(life_exp, left_on='country', right_on='geo', how='right')
print(df.shape)
df = df.merge(gdppc_gm, on=['geo', 'time'], how='inner')
print(df.shape)
df = df.merge(tfr_gm, on=['geo', 'time'], how='left')
df = df.merge(cbr_gm, on=['geo', 'time'], how='left')
df = df.merge(cdr_gm, on=['geo', 'time'], how='left')
df = df.merge(contraception_gm, on=['geo', 'time'], how='left')
df = df.merge(food_gm, on=['geo', 'time'], how='left')
df = df.merge(gdppc_pw_gm, on=['geo', 'time'], how='left')
df['year'] = df['time']
df
(43444, 26) (40303, 27)
country | g77_and_oecd_countries | income_3groups | income_groups | is--country | iso3166_1_alpha2 | iso3166_1_alpha3 | iso3166_1_numeric | iso3166_2 | landlocked | ... | life_expectancy_years | time | income_per_person_gdppercapita_ppp_inflation_adjusted | children_per_woman_total_fertility | crude_birth_rate_births_per_1000_population | crude_death_rate_deaths_per_1000_population | contraceptive_use_percent_of_women_ages_15_49 | food_supply_kilocalories_per_person_and_day | gdpperemployee_us_inflation_adjusted | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | afg | g77 | low_income | low_income | True | AF | AFG | 4.0 | NaN | landlocked | ... | 28.21 | 1800 | 683 | 7.00 | 48.14 | NaN | NaN | NaN | NaN | 1800 |
1 | afg | g77 | low_income | low_income | True | AF | AFG | 4.0 | NaN | landlocked | ... | 28.20 | 1801 | 683 | 7.00 | 48.14 | NaN | NaN | NaN | NaN | 1801 |
2 | afg | g77 | low_income | low_income | True | AF | AFG | 4.0 | NaN | landlocked | ... | 28.19 | 1802 | 683 | 7.00 | 48.14 | NaN | NaN | NaN | NaN | 1802 |
3 | afg | g77 | low_income | low_income | True | AF | AFG | 4.0 | NaN | landlocked | ... | 28.18 | 1803 | 683 | 7.00 | 48.14 | NaN | NaN | NaN | NaN | 1803 |
4 | afg | g77 | low_income | low_income | True | AF | AFG | 4.0 | NaN | landlocked | ... | 28.17 | 1804 | 683 | 7.00 | 48.14 | NaN | NaN | NaN | NaN | 1804 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
40298 | ssd | NaN | low_income | low_income | True | SS | SSD | 728.0 | NaN | landlocked | ... | 56.70 | 2011 | 4190 | 5.29 | 37.91 | 11.600 | NaN | NaN | 2139.73 | 2011 |
40299 | ssd | NaN | low_income | low_income | True | SS | SSD | 728.0 | NaN | landlocked | ... | 56.80 | 2012 | 2196 | 5.20 | 37.51 | 11.104 | NaN | NaN | 2047.46 | 2012 |
40300 | ssd | NaN | low_income | low_income | True | SS | SSD | 728.0 | NaN | landlocked | ... | 57.20 | 2013 | 2426 | 5.11 | 37.13 | 11.136 | NaN | NaN | 2258.47 | 2013 |
40301 | ssd | NaN | low_income | low_income | True | SS | SSD | 728.0 | NaN | landlocked | ... | 57.60 | 2014 | 2461 | 5.02 | 36.76 | 11.493 | NaN | NaN | 2282.65 | 2014 |
40302 | ssd | NaN | low_income | low_income | True | SS | SSD | 728.0 | NaN | landlocked | ... | 58.00 | 2015 | 2162 | 4.94 | 36.40 | 11.072 | NaN | NaN | 1998.36 | 2015 |
40303 rows × 34 columns
Let's get country groups etc from WDI as before¶
Steps¶
wbcountries = wb.get_countries()
wbcountries = wbcountries.loc[wbcountries.region.isin(['Aggregates'])==False].reset_index(drop=True)
wbcountries['name'] = wbcountries.name.str.strip()
wbcountries['incomeLevel'] = wbcountries['incomeLevel'].str.title()
wbcountries.loc[wbcountries.iso3c=='VEN', 'incomeLevel'] = 'Upper Middle Income'
df['iso3c'] = df['country'].str.upper()
wdi = wbcountries.merge(df, on='iso3c', suffixes=['', '_GM'])
wdi.head()
iso3c | iso2c | name | region | adminregion | incomeLevel | lendingType | capitalCity | longitude | latitude | ... | life_expectancy_years | time | income_per_person_gdppercapita_ppp_inflation_adjusted | children_per_woman_total_fertility | crude_birth_rate_births_per_1000_population | crude_death_rate_deaths_per_1000_population | contraceptive_use_percent_of_women_ages_15_49 | food_supply_kilocalories_per_person_and_day | gdpperemployee_us_inflation_adjusted | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AFG | AF | Afghanistan | South Asia | South Asia | Low Income | IDA | Kabul | 69.1761 | 34.5228 | ... | 28.21 | 1800 | 683 | 7.0 | 48.14 | NaN | NaN | NaN | NaN | 1800 |
1 | AFG | AF | Afghanistan | South Asia | South Asia | Low Income | IDA | Kabul | 69.1761 | 34.5228 | ... | 28.20 | 1801 | 683 | 7.0 | 48.14 | NaN | NaN | NaN | NaN | 1801 |
2 | AFG | AF | Afghanistan | South Asia | South Asia | Low Income | IDA | Kabul | 69.1761 | 34.5228 | ... | 28.19 | 1802 | 683 | 7.0 | 48.14 | NaN | NaN | NaN | NaN | 1802 |
3 | AFG | AF | Afghanistan | South Asia | South Asia | Low Income | IDA | Kabul | 69.1761 | 34.5228 | ... | 28.18 | 1803 | 683 | 7.0 | 48.14 | NaN | NaN | NaN | NaN | 1803 |
4 | AFG | AF | Afghanistan | South Asia | South Asia | Low Income | IDA | Kabul | 69.1761 | 34.5228 | ... | 28.17 | 1804 | 683 | 7.0 | 48.14 | NaN | NaN | NaN | NaN | 1804 |
5 rows × 44 columns
Regression Analysis with¶
url = 'https://www.statsmodels.org/stable/index.html'
IFrame(url, width=800, height=400)
Linear Regressions using OLS¶
It is very easy to run a regression in statsmodels.
We only need
- Data in a pandas dataframe
- An equation we want to estimate
Equations are strings of the form
'dependent_variable ~ indep_var_1 + function(indep_var2) + C(indep_var3)'
where:
dependent_variable
is the outcome variable of interestindep_var_1
is the first independent variablefunction(indep_var2)
is a function of another independent variable (if needed)C(indep_var3)
defines fixed-effects/dummies based on categories given in indep_var3
Simple Regression of Log[Life Expectancy] and Log[GDP pc]¶
wdi['ln_life_exp'] = wdi['life_expectancy_years'].apply(np.log)
wdi['ln_gdp_pc'] = wdi['income_per_person_gdppercapita_ppp_inflation_adjusted'].apply(np.log)
wdi['tfr'] = wdi['children_per_woman_total_fertility']
wdi['life_exp'] = wdi['life_expectancy_years']
wdi['gdp_pc'] = wdi['income_per_person_gdppercapita_ppp_inflation_adjusted']
year = wdi['year'].max()
yvar = 'ln_life_exp'
xvar = 'ln_gdp_pc'
zvar = 'tfr'
dffig = wdi.loc[wdi.year==year]\
.dropna(subset=[xvar, yvar, zvar])\
.sort_values(by='region').reset_index(drop=True)
dffig.head()
iso3c | iso2c | name | region | adminregion | incomeLevel | lendingType | capitalCity | longitude | latitude | ... | crude_death_rate_deaths_per_1000_population | contraceptive_use_percent_of_women_ages_15_49 | food_supply_kilocalories_per_person_and_day | gdpperemployee_us_inflation_adjusted | year | ln_life_exp | ln_gdp_pc | tfr | life_exp | gdp_pc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | PRK | KP | Korea, Dem. People's Rep. | East Asia & Pacific | East Asia & Pacific (excluding high income) | Low Income | Not classified | Pyongyang | 125.754 | 39.03190 | ... | 8.088 | NaN | 2061.0 | 1110.23 | 2015 | 4.268298 | 7.625107 | 1.92 | 71.4 | 2049 |
1 | FSM | FM | Micronesia, Fed. Sts. | East Asia & Pacific | East Asia & Pacific (excluding high income) | Lower Middle Income | IDA | Palikir | 158.185 | 6.91771 | ... | 5.008 | NaN | NaN | NaN | 2015 | 4.204693 | 8.145550 | 3.19 | 67.0 | 3448 |
2 | MNG | MN | Mongolia | East Asia & Pacific | East Asia & Pacific (excluding high income) | Lower Middle Income | IBRD | Ulaanbaatar | 106.937 | 47.91290 | ... | 6.158 | NaN | 2483.0 | 9538.68 | 2015 | 4.178992 | 9.306559 | 2.79 | 65.3 | 11010 |
3 | THA | TH | Thailand | East Asia & Pacific | East Asia & Pacific (excluding high income) | Upper Middle Income | IBRD | Bangkok | 100.521 | 13.73080 | ... | 6.724 | NaN | 2782.0 | 10198.24 | 2015 | 4.318821 | 9.698000 | 1.50 | 75.1 | 16285 |
4 | LAO | LA | Lao PDR | East Asia & Pacific | East Asia & Pacific (excluding high income) | Lower Middle Income | IDA | Vientiane | 102.177 | 18.58260 | ... | 7.513 | NaN | 2738.0 | 2981.26 | 2015 | 4.195697 | 8.786304 | 2.76 | 66.4 | 6544 |
5 rows × 49 columns
mod = smf.ols(formula='ln_life_exp ~ ln_gdp_pc', data=dffig, missing='drop').fit()
mod.summary2()
Model: | OLS | Adj. R-squared: | 0.647 |
Dependent Variable: | ln_life_exp | AIC: | -470.7658 |
Date: | 2024-02-22 09:01 | BIC: | -464.3359 |
No. Observations: | 184 | Log-Likelihood: | 237.38 |
Df Model: | 1 | F-statistic: | 336.1 |
Df Residuals: | 182 | Prob (F-statistic): | 3.29e-43 |
R-squared: | 0.649 | Scale: | 0.0044842 |
Coef. | Std.Err. | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 3.5419 | 0.0398 | 89.0717 | 0.0000 | 3.4635 | 3.6204 |
ln_gdp_pc | 0.0781 | 0.0043 | 18.3335 | 0.0000 | 0.0697 | 0.0865 |
Omnibus: | 61.936 | Durbin-Watson: | 1.817 |
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 167.770 |
Skew: | -1.422 | Prob(JB): | 0.000 |
Kurtosis: | 6.714 | Condition No.: | 76 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Plot Data and OLS Regression Predictions¶
pred_ols = mod.get_prediction()
iv_l = pred_ols.summary_frame()["mean_ci_lower"]
iv_u = pred_ols.summary_frame()["mean_ci_upper"]
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(dffig[xvar], dffig[yvar], "o", label="data")
ax.plot(dffig[xvar], mod.fittedvalues, "r--.", label="OLS")
ax.plot(dffig[xvar], iv_u, "r--")
ax.plot(dffig[xvar], iv_l, "r--")
ax.legend(loc="best")
<matplotlib.legend.Legend at 0x163c25430>
fig
Simple Regression of Log[Life Expectancy] and Log[GDP pc] for WB region dummies¶
mod2 = smf.ols(formula='ln_life_exp ~ ln_gdp_pc + C(region)', data=dffig, missing='drop').fit()
mod2.summary2()
Model: | OLS | Adj. R-squared: | 0.745 |
Dependent Variable: | ln_life_exp | AIC: | -524.8326 |
Date: | 2024-02-22 09:01 | BIC: | -499.1131 |
No. Observations: | 184 | Log-Likelihood: | 270.42 |
Df Model: | 7 | F-statistic: | 77.35 |
Df Residuals: | 176 | Prob (F-statistic): | 2.26e-50 |
R-squared: | 0.755 | Scale: | 0.0032382 |
Coef. | Std.Err. | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 3.7983 | 0.0474 | 80.1247 | 0.0000 | 3.7047 | 3.8919 |
C(region)[T.Europe & Central Asia] | 0.0208 | 0.0147 | 1.4109 | 0.1600 | -0.0083 | 0.0499 |
C(region)[T.Latin America & Caribbean ] | 0.0151 | 0.0152 | 0.9891 | 0.3240 | -0.0150 | 0.0451 |
C(region)[T.Middle East & North Africa] | 0.0294 | 0.0169 | 1.7354 | 0.0844 | -0.0040 | 0.0627 |
C(region)[T.North America] | 0.0255 | 0.0426 | 0.5985 | 0.5503 | -0.0586 | 0.1097 |
C(region)[T.South Asia] | -0.0052 | 0.0231 | -0.2255 | 0.8219 | -0.0509 | 0.0405 |
C(region)[T.Sub-Saharan Africa ] | -0.0911 | 0.0149 | -6.1297 | 0.0000 | -0.1205 | -0.0618 |
ln_gdp_pc | 0.0518 | 0.0050 | 10.2953 | 0.0000 | 0.0419 | 0.0617 |
Omnibus: | 38.972 | Durbin-Watson: | 2.290 |
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 79.400 |
Skew: | -0.986 | Prob(JB): | 0.000 |
Kurtosis: | 5.543 | Condition No.: | 111 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Simple Regression of Log[Life Expectancy] and Log[GDP pc] and TFR, accounting for WB region dummies¶
mod3 = smf.ols(formula='ln_life_exp ~ ln_gdp_pc + tfr + C(region)', data=dffig, missing='drop').fit()
mod3.summary2()
Model: | OLS | Adj. R-squared: | 0.754 |
Dependent Variable: | ln_life_exp | AIC: | -530.3147 |
Date: | 2024-02-22 09:01 | BIC: | -501.3803 |
No. Observations: | 184 | Log-Likelihood: | 274.16 |
Df Model: | 8 | F-statistic: | 71.00 |
Df Residuals: | 175 | Prob (F-statistic): | 6.11e-51 |
R-squared: | 0.764 | Scale: | 0.0031269 |
Coef. | Std.Err. | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 3.9325 | 0.0682 | 57.6766 | 0.0000 | 3.7979 | 4.0670 |
C(region)[T.Europe & Central Asia] | 0.0161 | 0.0146 | 1.1047 | 0.2708 | -0.0127 | 0.0449 |
C(region)[T.Latin America & Caribbean ] | 0.0115 | 0.0150 | 0.7662 | 0.4446 | -0.0181 | 0.0412 |
C(region)[T.Middle East & North Africa] | 0.0355 | 0.0168 | 2.1151 | 0.0358 | 0.0024 | 0.0686 |
C(region)[T.North America] | 0.0279 | 0.0419 | 0.6669 | 0.5057 | -0.0548 | 0.1107 |
C(region)[T.South Asia] | -0.0095 | 0.0228 | -0.4171 | 0.6771 | -0.0545 | 0.0355 |
C(region)[T.Sub-Saharan Africa ] | -0.0687 | 0.0168 | -4.0911 | 0.0001 | -0.1019 | -0.0356 |
ln_gdp_pc | 0.0419 | 0.0061 | 6.8217 | 0.0000 | 0.0298 | 0.0541 |
tfr | -0.0168 | 0.0062 | -2.6950 | 0.0077 | -0.0290 | -0.0045 |
Omnibus: | 56.848 | Durbin-Watson: | 2.341 |
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 168.323 |
Skew: | -1.255 | Prob(JB): | 0.000 |
Kurtosis: | 6.956 | Condition No.: | 164 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Producing a nice table with stargazer¶
url = 'https://nbviewer.org/github/mwburke/stargazer/blob/master/examples.ipynb'
IFrame(url, width=800, height=400)
Add the estimated models to Stargazer¶
stargazer = Stargazer([mod, mod2, mod3])
stargazer.significant_digits(2)
stargazer.show_degrees_of_freedom(False)
#stargazer.dep_var_name = ''
stargazer.dependent_variable = ' Log[Life Expectancy (' + str(year) + ')]'
stargazer.custom_columns(['Simple', 'WB Regs', 'TFR'], [1, 1, 1])
#stargazer.show_model_numbers(False)
stargazer.rename_covariates({'ln_gdp_pc':' Log[GDP per capita (' + str(year) + ')]',
'tfr':'Total Fertility Rate (' + str(year) + ')'})
stargazer.add_line('WB Region FE', ['No', 'Yes', 'Yes'], LineLocation.FOOTER_TOP)
stargazer.covariate_order(['ln_gdp_pc', 'tfr'])
stargazer.cov_spacing = 2
stargazer
Dependent variable: Log[Life Expectancy (2015)] | |||
Simple | WB Regs | TFR | |
(1) | (2) | (3) | |
Log[GDP per capita (2015)] | 0.08*** | 0.05*** | 0.04*** |
(0.00) | (0.01) | (0.01) | |
Total Fertility Rate (2015) | -0.02*** | ||
(0.01) | |||
WB Region FE | No | Yes | Yes |
Observations | 184 | 184 | 184 |
R2 | 0.65 | 0.75 | 0.76 |
Adjusted R2 | 0.65 | 0.74 | 0.75 |
Residual Std. Error | 0.07 | 0.06 | 0.06 |
F Statistic | 336.12*** | 77.35*** | 71.00*** |
Note: | *p<0.1; **p<0.05; ***p<0.01 |
To show the table¶
HTML(stargazer.render_html())
HTML(stargazer.render_html())
Dependent variable: Log[Life Expectancy (2015)] | |||
Simple | WB Regs | TFR | |
(1) | (2) | (3) | |
Log[GDP per capita (2015)] | 0.08*** | 0.05*** | 0.04*** |
(0.00) | (0.01) | (0.01) | |
Total Fertility Rate (2015) | -0.02*** | ||
(0.01) | |||
WB Region FE | No | Yes | Yes |
Observations | 184 | 184 | 184 |
R2 | 0.65 | 0.75 | 0.76 |
Adjusted R2 | 0.65 | 0.74 | 0.75 |
Residual Std. Error | 0.07 | 0.06 | 0.06 |
F Statistic | 336.12*** | 77.35*** | 71.00*** |
Note: | *p<0.1; **p<0.05; ***p<0.01 |
To export the table to another file¶
file_name = "gapminder_table.html" #Include directory path if needed
html_file = open(pathgraphs + file_name, "w" ) #This will overwrite an existing file
html_file.write( stargazer.render_html() )
html_file.close()
url = pathgraphs + 'table.html'
url = 'https://smu-econ-growth.github.io/EconGrowthUG-Slides-Working-with-GapMinder/gapminder_table.html'
IFrame(url, width=500, height=400)
Plotting GapMinder data¶
Many options¶
- Since the data is a pandas dataframe, we could just use its functions as we did previously
- Use the seaborn package
- Use the plotly package
- Use the plotnine package
Plots with¶
url = 'https://seaborn.pydata.org/examples/index.html'
IFrame(url, width=800, height=400)
Let's create a Scatterplot with varying point sizes and hues that plots the latitude and Log[GDP per capita] of each country and uses its log-population and the WB region in the last available year as the size and hue.
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
g = sns.relplot(x="ln_gdp_pc",
y="ln_life_exp",
data=dffig,
hue="region",
hue_order = dffig.region.drop_duplicates().sort_values(),
style="region",
style_order = dffig.region.drop_duplicates().sort_values(),
size="tfr",
sizes=(10, 400),
alpha=.5,
height=6,
aspect=2,
palette="muted",
)
g.set_axis_labels('Log[GDP per capita (' + str(year) + ')]', 'Log[Life Expectancy (' + str(year) + ')]')
<seaborn.axisgrid.FacetGrid at 0x15cfbf820>
g.fig
Using scatterplot
¶
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
fig, ax = plt.subplots()
sns.scatterplot(x="ln_gdp_pc",
y="ln_life_exp",
data=dffig,
hue="region",
hue_order = dffig.region.drop_duplicates().sort_values(),
style="region",
style_order = dffig.region.drop_duplicates().sort_values(),
size="tfr",
sizes=(10, 400),
alpha=.5,
palette="muted",
ax=ax
)
ax.set_xlabel('Latitude')
ax.set_ylabel('Log[GDP per capita (' + str(year) + ')]')
ax.legend(fontsize=10)
<matplotlib.legend.Legend at 0x15cd69640>
fig
def my_xy_plot(dfin,
x='ln_gdp_pc',
y='ln_life_exp',
labelvar='iso3c',
dx=0.006125,
dy=0.006125,
xlogscale=False,
ylogscale=False,
xlabel='Log[Income per capita in ' + str(year) + ']',
ylabel='Log[Life Expectancy ' + str(year) + ']',
labels=False,
xpct = False,
ypct = False,
OLS=False,
OLSlinelabel='OLS',
ssline=False,
sslinelabel='45 Degree Line',
filename='income-pop-growth.pdf',
hue='region',
hue_order=['East Asia & Pacific', 'Europe & Central Asia',
'Latin America & Caribbean ', 'Middle East & North Africa',
'North America', 'South Asia', 'Sub-Saharan Africa '],
style='incomeLevel',
style_order=['High Income', 'Upper Middle Income', 'Lower Middle Income', 'Low Income'],
palette=None,
size=None,
sizes=None,
legend_fontsize=10,
label_font_size=12,
save=True):
'''
Plot the association between x and var in dataframe using labelvar for labels.
'''
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
df = dfin.copy()
df = df.dropna(subset=[x, y]).reset_index(drop=True)
# Plot
k = 0
fig, ax = plt.subplots()
sns.scatterplot(x=x, y=y, data=df, ax=ax,
hue=hue,
hue_order=hue_order,
#hue='incomeLevel',
#hue_order=['High Income', 'Upper Middle Income', 'Lower Middle Income', 'Low Income'],
#hue_order=['East Asia & Pacific', 'Europe & Central Asia',
# 'Latin America & Caribbean ', 'Middle East & North Africa',
# 'North America', 'South Asia', 'Sub-Saharan Africa '],
alpha=1,
style=style,
style_order=style_order,
palette=palette,
size=size,
sizes=sizes,
#palette=sns.color_palette("Blues_r", df[hue].unique().shape[0]+6)[:df[hue].unique().shape[0]*2:2],
)
if OLS:
sns.regplot(x=x, y=y, data=df, ax=ax, label=OLSlinelabel, scatter=False)
if ssline:
ax.plot([df[x].min()*.99, df[x].max()*1.01], [df[x].min()*.99, df[x].max()*1.01], c='r', label=sslinelabel)
if labels:
movex = df[x].mean() * dx
movey = df[y].mean() * dy
for line in range(0,df.shape[0]):
ax.text(df[x][line]+movex, df[y][line]+movey, df[labelvar][line], horizontalalignment='left', fontsize=label_font_size, color='black')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
if xpct:
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
xticks = mtick.FormatStrFormatter(fmt)
ax.xaxis.set_major_formatter(xticks)
if ypct:
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
yticks = mtick.FormatStrFormatter(fmt)
ax.yaxis.set_major_formatter(yticks)
if ylogscale:
ax.set(yscale="log")
if xlogscale:
ax.set(xscale="log")
handles, labels = ax.get_legend_handles_labels()
handles = np.array(handles)
labels = np.array(labels)
handles = list(handles[(labels!=hue) & (labels!=style) & (labels!=size)])
labels = list(labels[(labels!=hue) & (labels!=style) & (labels!=size)])
ax.legend(handles=handles, labels=labels, fontsize=legend_fontsize)
if save:
plt.savefig(pathgraphs + filename, dpi=300, bbox_inches='tight')
return fig
g = my_xy_plot(dffig,
x='ln_gdp_pc',
y='ln_life_exp',
xlabel='Log[GDP per capita (' + str(year) +')]',
ylabel='Log[Life Expectancy (' + str(year) +')]',
OLS=True,
labels=True,
size="tfr",
sizes=(10, 400),
filename='ln-life-exp-ln-gdp-pc-gapminder.pdf')
g
Plot the evolution of variables across time by groups¶
def my_xy_line_plot(dfin,
x='year',
y='ln_gdp_pc',
labelvar='iso3c',
dx=0.006125,
dy=0.006125,
xlogscale=False,
ylogscale=False,
xlabel='Year',
ylabel='Log[Income per capita in ' + str(year) + ']',
labels=False,
xpct = False,
ypct = False,
OLS=False,
OLSlinelabel='OLS',
ssline=False,
sslinelabel='45 Degree Line',
filename='income-pop-growth.pdf',
hue='region',
hue_order=['East Asia & Pacific', 'Europe & Central Asia',
'Latin America & Caribbean ', 'Middle East & North Africa',
'North America', 'South Asia', 'Sub-Saharan Africa '],
style='incomeLevel',
style_order=['High Income', 'Upper Middle Income', 'Lower Middle Income', 'Low Income'],
palette=None,
legend_fontsize=10,
label_fontsize=12,
loc=None,
save=True):
'''
Plot the association between x and var in dataframe using labelvar for labels.
'''
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
df = dfin.copy()
df = df.dropna(subset=[x, y]).reset_index(drop=True)
# Plot
k = 0
fig, ax = plt.subplots()
sns.lineplot(x=x, y=y, data=df, ax=ax,
hue=hue,
hue_order=hue_order,
alpha=1,
style=style,
style_order=style_order,
palette=palette,
)
if OLS:
sns.regplot(x=x, y=y, data=df, ax=ax, label=OLSlinelabel, scatter=False)
if ssline:
ax.plot([df[x].min()*.99, df[x].max()*1.01], [df[x].min()*.99, df[x].max()*1.01], c='r', label=sslinelabel)
if labels:
movex = df[x].mean() * dx
movey = df[y].mean() * dy
for line in range(0,df.shape[0]):
ax.text(df[x][line]+movex, df[y][line]+movey, df[labelvar][line], horizontalalignment='left', fontsize=label_fontsize, color='black')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
if xpct:
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
xticks = mtick.FormatStrFormatter(fmt)
ax.xaxis.set_major_formatter(xticks)
if ypct:
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
yticks = mtick.FormatStrFormatter(fmt)
ax.yaxis.set_major_formatter(yticks)
if ylogscale:
ax.set(yscale="log")
if xlogscale:
ax.set(xscale="log")
handles, labels = ax.get_legend_handles_labels()
handles = np.array(handles)
labels = np.array(labels)
handles = list(handles[(labels!='region') & (labels!='incomeLevel')])
labels = list(labels[(labels!='region') & (labels!='incomeLevel')])
ax.legend(handles=handles, labels=labels, fontsize=legend_fontsize, loc=loc)
if save:
plt.savefig(pathgraphs + filename, dpi=300, bbox_inches='tight')
return fig
Log[GDP per capita across the world] by WB Income Groups¶
palette=sns.color_palette("Blues_r", wdi['incomeLevel'].unique().shape[0]+6)[:wdi['incomeLevel'].unique().shape[0]*2:2]
fig = my_xy_line_plot(wdi,
x='year',
y='ln_gdp_pc',
xlabel='Year',
ylabel='Log[GDP per capita]',
filename='ln-gdp-pc-income-groups-TS-gapminder.pdf',
hue='incomeLevel',
hue_order=['High Income', 'Upper Middle Income', 'Lower Middle Income', 'Low Income'],
palette=palette,
OLS=False,
labels=False,
legend_fontsize=16,
loc='lower right',
save=True)
fig
Log[Life Expectancy] across the world by WB Regions¶
#palette=sns.color_palette("Blues_r", wdi['region'].unique().shape[0]+6)[:wdi['region'].unique().shape[0]*2:2]
fig = my_xy_line_plot(wdi,
x='year',
y='ln_life_exp',
xlabel='Year',
ylabel='Log[Life Expectancy]',
ylogscale=True,
filename='ln-life-exp-regions-TS-gapminder.pdf',
style='region',
style_order=['East Asia & Pacific', 'Europe & Central Asia',
'Latin America & Caribbean ', 'Middle East & North Africa',
'North America', 'South Asia', 'Sub-Saharan Africa '],
#palette=palette,
OLS=False,
labels=False,
legend_fontsize=12,
loc='lower right',
save=True)
fig
Plots with¶
url = 'https://plotly.com/python/'
IFrame(url, width=800, height=400)
Let's select symbols to plot so it looks like the previous ones and also to improve visibility¶
symbols = ['circle', 'x', 'square', 'cross', 'diamond', 'star-diamond', 'triangle-up']
fig = px.scatter(dffig,
x="ln_gdp_pc",
y="ln_life_exp",
color='region',
symbol='region',
symbol_sequence=symbols,
hover_name='name',
hover_data=['iso3c', 'ln_gdp_pc', 'ln_life_exp'],
size='tfr',
size_max=15,
trendline="ols",
trendline_scope="overall",
trendline_color_override="black",
labels={
"latitude": "Latitude",
"ln_gdp_pc": "Log[GDP per capita (" + str(year) + ")]",
"ln_life_exp": "Log[Life Expectancy (" + str(year) + ")]",
"region": "WB Region"
},
opacity=0.75,
height=800,
)
fig.show()
Change marker borders¶
fig.update_traces(marker=dict(#size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
Increase width of trend line¶
tr_line=[]
for k, trace in enumerate(fig.data):
if trace.mode is not None and trace.mode == 'lines':
tr_line.append(k)
print(tr_line)
for id in tr_line:
fig.data[id].update(line_width=3)
[7]
fig.show()
Change legend position¶
fig.update_layout(legend=dict(
yanchor="top",
y=0.25,
xanchor="left",
x=0.9
))
fig.show()
To save the figure use in your desired format¶
fig.write_image(pathgraphs + "fig1.pdf")
fig.write_image(pathgraphs + "fig1.png")
fig.write_image(pathgraphs + "fig1.jpg")
fig.write_image(pathgraphs + "ln-life-exp-ln-gdp-pc-plotly-gapminder.pdf", height=1000, width=1500, scale=4)
We can access the results of the regression in plotly express¶
results = px.get_trendline_results(fig)
results.px_fit_results.iloc[0].summary()
Dep. Variable: | y | R-squared: | 0.649 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.647 |
Method: | Least Squares | F-statistic: | 336.1 |
Date: | Thu, 22 Feb 2024 | Prob (F-statistic): | 3.29e-43 |
Time: | 09:02:51 | Log-Likelihood: | 237.38 |
No. Observations: | 184 | AIC: | -470.8 |
Df Residuals: | 182 | BIC: | -464.3 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 3.5419 | 0.040 | 89.072 | 0.000 | 3.463 | 3.620 |
x1 | 0.0781 | 0.004 | 18.334 | 0.000 | 0.070 | 0.087 |
Omnibus: | 61.936 | Durbin-Watson: | 2.062 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 167.770 |
Skew: | -1.422 | Prob(JB): | 3.71e-37 |
Kurtosis: | 6.714 | Cond. No. | 76.0 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Maps with¶
To create maps we need to obtain geographical information¶
There are various types of data in Geographic Information Systems (GIS)
- Location of cities, resources, etc. (point data)
- Shape of rivers, borders, countries, etc. (shape data)
- Numerical data for locations (elevation, temperature, number of people)
Download Country boundary data from Natural Earth¶
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
url = 'https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip'
r = requests.get(url, headers=headers)
countries = gp.read_file(io.BytesIO(r.content))
countries.head()
featurecla | scalerank | LABELRANK | SOVEREIGNT | SOV_A3 | ADM0_DIF | LEVEL | TYPE | TLC | ADMIN | ... | FCLASS_TR | FCLASS_ID | FCLASS_PL | FCLASS_GR | FCLASS_IT | FCLASS_NL | FCLASS_SE | FCLASS_BD | FCLASS_UA | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Admin-0 country | 0 | 2 | Indonesia | IDN | 0 | 2 | Sovereign country | 1 | Indonesia | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | MULTIPOLYGON (((117.70361 4.16341, 117.70361 4... |
1 | Admin-0 country | 0 | 3 | Malaysia | MYS | 0 | 2 | Sovereign country | 1 | Malaysia | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | MULTIPOLYGON (((117.70361 4.16341, 117.69711 4... |
2 | Admin-0 country | 0 | 2 | Chile | CHL | 0 | 2 | Sovereign country | 1 | Chile | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... |
3 | Admin-0 country | 0 | 3 | Bolivia | BOL | 0 | 2 | Sovereign country | 1 | Bolivia | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... |
4 | Admin-0 country | 0 | 2 | Peru | PER | 0 | 2 | Sovereign country | 1 | Peru | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... |
5 rows × 169 columns
The boundary file is a geopandas dataframe¶
fig, ax = plt.subplots(figsize=(15,10))
countries.plot(ax=ax)
ax.set_title("WGS84 (lat/lon)", fontdict={'fontsize':34})
Text(0.5, 1.0, 'WGS84 (lat/lon)')
Merge with other data and plot¶
dffig2 = countries.merge(dffig, left_on='ADM0_A3', right_on='iso3c')
fig, ax = plt.subplots(figsize=(15,10))
dffig2.plot(column='life_exp', ax=ax, cmap='Reds')
ax.set_title("WGS84 (lat/lon)", fontdict={'fontsize':34})
Text(0.5, 1.0, 'WGS84 (lat/lon)')
url = 'https://residentmario.github.io/geoplot/'
IFrame(url, width=800, height=400)
Plot Countries¶
gplt.polyplot(
countries, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor='white', facecolor='lightgray',
rasterized=True,
extent=[-180, -90, 180, 90],
)
<GeoAxes: >
Plot Data¶
gplt.choropleth(dffig2, hue='life_exp',
projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor='white',
linewidth=1,
cmap='Reds', legend=True,
scheme='FisherJenks',
legend_kwargs={'bbox_to_anchor':(0.23, 0.5),
'frameon': True,
'title':'Life Expectancy',
'fontsize':14,
'title_fontsize':14,
},
figsize=(12,8),
rasterized=True,
)
<GeoAxes: >
Data and Borders¶
ax = gplt.choropleth(dffig2, hue='life_exp', projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor='white', linewidth=1,
cmap='Reds', legend=True,
scheme='FisherJenks',
legend_kwargs={'bbox_to_anchor':(0.23, 0.5),
'frameon': True,
'title':'Life Expectancy',
'fontsize':14,
'title_fontsize':14,
},
figsize=(15,10),
rasterized=True,
)
gplt.polyplot(countries, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor='white', facecolor='lightgray',
ax=ax,
rasterized=True,
extent=[-180, -90, 180, 90],
)
<GeoAxes: >
Use a nice function¶
# Functions for plotting
def center_wrap(text, cwidth=32, **kw):
'''Center Text (to be used in legend)'''
lines = text
#lines = textwrap.wrap(text, **kw)
return "\n".join(line.center(cwidth) for line in lines)
def MyChoropleth(mydf,
myfile='',
myvar='',
mylegend='',
k=5,
extent=[-180, -90, 180, 90],
figsize=(24, 16),
bbox_to_anchor=(0.2, 0.5),
fontsize=12,
title_fontsize=12,
edgecolor='white',
facecolor='lightgray',
scheme='FisherJenks',
bins=None,
pct=None,
legend=True,
legend_labels=None,
legend_kwargs=None,
legend_values=None,
bold=True,
rasterized=False,
save=True,
percent=False,
rn=0,
cmap='Reds',
reverse=False,
**kwargs):
# Choropleth
# Color scheme
if scheme=='EqualInterval':
scheme = mc.EqualInterval(mydf[myvar], k=k)
elif scheme=='Quantiles':
scheme = mc.Quantiles(mydf[myvar], k=k)
elif scheme=='BoxPlot':
scheme = mc.BoxPlot(mydf[myvar], k=k)
elif scheme=='FisherJenks':
scheme = mc.FisherJenks(mydf[myvar], k=k)
elif scheme=='FisherJenksSampled':
scheme = mc.FisherJenksSampled(mydf[myvar], k=k)
elif scheme=='HeadTailBreaks':
scheme = mc.HeadTailBreaks(mydf[myvar], k=k)
elif scheme=='JenksCaspall':
scheme = mc.JenksCaspall(mydf[myvar], k=k)
elif scheme=='JenksCaspallForced':
scheme = mc.JenksCaspallForced(mydf[myvar], k=k)
elif scheme=='JenksCaspallSampled':
scheme = mc.JenksCaspallSampled(mydf[myvar], k=k)
elif scheme=='KClassifiers':
scheme = mc.KClassifiers(mydf[myvar], k=k)
elif scheme=='Percentiles':
scheme = mc.Percentiles(mydf[myvar], pct=pct)
elif scheme=='UserDefined':
scheme = mc.UserDefined(mydf[myvar], bins=bins)
if legend:
legend_kwargs = {'bbox_to_anchor': bbox_to_anchor,
'frameon': True,
'title':mylegend,
'fontsize':fontsize,
'title_fontsize':title_fontsize,
}
if legend_labels is None:
# Format legend
upper_bounds = scheme.bins
# get and format all bounds
bounds = []
for index, upper_bound in enumerate(upper_bounds):
if index == 0:
lower_bound = mydf[myvar].min()
else:
lower_bound = upper_bounds[index-1]
# format the numerical legend here
if percent:
bound = f'{lower_bound:.{rn}%} - {upper_bound:.{rn}%}'.format(width=rn)
else:
bound = f'{float(lower_bound):,.{rn}f} - {float(upper_bound):,.{rn}f}'.format(width=rn)
bounds.append(bound)
legend_labels = bounds
# Reverse the order of the legend labels
if reverse:
legend_labels = legend_labels[::-1]
legend_values = list(range(len(legend_labels)))[::-1]
#Plot
ax = gplt.choropleth(
mydf, hue=myvar, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor='white', linewidth=1,
cmap=cmap, legend=legend,
scheme=scheme,
legend_kwargs=legend_kwargs,
legend_labels = legend_labels,
legend_values=legend_values,
figsize=figsize,
rasterized=rasterized,
)
gplt.polyplot(
countries, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor=edgecolor, facecolor=facecolor,
ax=ax,
rasterized=rasterized,
extent=extent,
)
# Make the legend title bold
if bold:
ax.get_legend().get_title().set_fontweight('bold')
if save:
#plt.savefig(pathgraphs + myfile + '_' + myvar.replace(' ', '_') +'.pdf', dpi=300, bbox_inches='tight')
plt.savefig(pathgraphs + myfile + '_' + myvar.replace(' ', '_') +'.jpg', dpi=300, bbox_inches='tight')
return ax
mylegend = center_wrap(["Life Expectancy in " + str(year)], cwidth=32, width=32)
MyChoropleth(dffig2, myfile='fig-gapminder-life-exp-' + str(year),
myvar='life_exp',
mylegend=mylegend,
k=10,
fontsize=16,
title_fontsize=16,
reverse=True,
scheme='Quantiles',
save=True)
<GeoAxes: >
Quick and Easy Maps with¶
url = 'https://plotly.com/python/maps/'
IFrame(url, width=800, height=400)
Map using classes (similar to geoplot)¶
Choose a classifier and classify the data¶
scheme = mc.Quantiles(dffig2['life_exp'], k=8)
classifier = mc.Quantiles.make(k=8, rolling=True)
dffig2['life_exp_q'] = classifier(dffig2['life_exp'])
dffig2['life_exp_qc'] = dffig2['life_exp_q'].apply(lambda x: scheme.get_legend_classes()[x].replace('[ ', '[').replace('( ', '('))
fig = px.choropleth(dffig2.sort_values('life_exp_q', ascending=False),
locations="iso3c",
color="life_exp_qc",
hover_name='name',
hover_data=['iso3c', 'tfr'],
labels={
"life_exp_qc": "Life Expectancy (" + str(year) + ")",
},
color_discrete_sequence=px.colors.sequential.Reds_r,
height=600,
width=1000,
)
# Change legend position
fig.update_layout(legend=dict(
yanchor="bottom",
y=0.15,
xanchor="left",
x=0.05
))
/Users/ozak/anaconda3/envs/EconGrowthUG/lib/python3.9/site-packages/plotly/express/_core.py:2065: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
fig.show()
fig = px.choropleth(dffig2.sort_values('life_exp_q', ascending=False),
locations="iso3c",
color="life_exp_qc",
hover_name='name',
hover_data=['iso3c', 'life_exp' ,'tfr'],
labels={
"life_exp_qc": "Life Expectancy (" + str(year) + ")",
"life_exp": "Life Expectancy (" + str(year) + ")",
'iso3c':'ISO code',
"tfr": "Total Fertility Rate (" + str(year) + ")]",
},
color_discrete_sequence=px.colors.sequential.Blues_r,
height=600,
width=1000,
)
# Change legend position
fig.update_layout(legend=dict(
yanchor="bottom",
y=0.15,
xanchor="left",
x=0.05
))
/Users/ozak/anaconda3/envs/EconGrowthUG/lib/python3.9/site-packages/plotly/express/_core.py:2065: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
fig.show()
fig = px.choropleth(dffig,
locations="iso3c",
color="life_exp",
hover_name='name',
hover_data=['iso3c', 'tfr'],
labels={
"life_exp": "Life Expectancy (" + str(year) + ")]",
"tfr": "Total Fertility Rate"
},
#color_continuous_scale=px.colors.sequential.Plasma,
color_continuous_scale="Reds",
height=600,
width=1100,
)
fig.show()
fig.update_layout(coloraxis_colorbar=dict(
orientation = 'h',
yanchor="bottom",
xanchor="left",
y=-.2,
x=0,
))
fig.update_coloraxes(colorbar_title_side='top')
fig.show()
# Change legend position
fig.update_layout(legend=dict(
yanchor="top",
y=0.99,
xanchor="center",
x=0.01,
orientation='h',
))
fig.show()
fig = go.Figure(data=go.Choropleth(
locations = dffig['iso3c'],
z = dffig['life_exp'],
text = dffig['name'],
colorscale = 'Blues',
autocolorscale=False,
reversescale=False,
marker_line_color='darkgray',
marker_line_width=0.5,
#colorbar_tickprefix = '$',
colorbar_title = 'Life Expectancy',
)
)
fig.update_layout(
autosize=False,
width=800,
height=400,
margin=dict(
l=5,
r=5,
b=10,
t=10,
pad=1
),
paper_bgcolor="LightSteelBlue",
)
fig.show()
fig = go.Figure(data=go.Choropleth(
locations = dffig['iso3c'],
z = dffig['life_exp'],
text = dffig['name'],
colorscale = 'Blues',
autocolorscale=False,
reversescale=False,
marker_line_color='darkgray',
marker_line_width=0.5,
#colorbar_tickprefix = '$',
colorbar_title = 'Life Expectancy',
)
)
fig.update_layout(
autosize=False,
width=1000,
height=600,
margin=dict(
l=1,
r=1,
b=1,
t=1,
pad=.1
),
paper_bgcolor="LightSteelBlue",
)
# Change legend position
cb = fig.data[0].colorbar
cb.orientation = 'h'
cb.yanchor = 'bottom'
cb.xanchor = 'center'
cb.y = .1
cb.title.side = 'top'
fig.show()
Exercises ¶
MyChoropleth
function map the global spatial distribution of TFR, CBR, and CDR in the years 1990, 1995, 2000, 2010, and the last year in the sample.
my_xy_plot
function plot the relation between GDP per capita and Life Expectancy, TFR, CBR, and CDR in the years 1990, 1995, 2000, 2010, 2020.
my_xy_line_plot
function plot the evolution of Life Expectancy, TFR, CBR, and CDR by income groups and regions (separate figures).
Notebook written by Ömer Özak for his students in Economics at Southern Methodist University. Feel free to use, distribute, or contribute.