GIS with and ¶
Part III: Data Munging...Combining GIS with Other Tools¶
Set-up our environment as before¶
Let's import the packages we will use and set the paths for outputs.
# Let's import pandas and some other basic packages we will use
from __future__ import division
import pandas as pd
import numpy as np
import os, sys
# GIS packages
import geopandas as gpd
from geopandas.tools import overlay
from shapely.geometry import Polygon, Point
import georasters as gr
# Alias for Geopandas
gp = gpd
# Plotting
import matplotlib as mpl
import seaborn as sns
# Setup seaborn
sns.set()
# Mapping
import geoplot as gplt
import geoplot.crs as gcrs
import mapclassify as mc
import textwrap
%pylab --no-import-all
%matplotlib inline
Using matplotlib backend: <object object at 0x1197f19d0> %pylab is deprecated, use %matplotlib inline and import the required libraries. Populating the interactive namespace from numpy and matplotlib
# Functions for plotting
def center_wrap(text, cwidth=32, **kw):
    '''Center text line by line (to be used in legend titles).

    Parameters
    ----------
    text : str or iterable of str
        An iterable is taken as ready-made lines. A single string is first
        split into lines with ``textwrap.wrap``.
    cwidth : int
        Width within which every line is centered.
    **kw
        Passed to ``textwrap.wrap`` when ``text`` is a string (e.g.
        ``width``); ignored when the lines are already split.

    Returns
    -------
    str
        The centered lines joined by newlines.
    '''
    if isinstance(text, str):
        # Iterating a bare string would center every single character
        lines = textwrap.wrap(text, **kw)
    else:
        lines = text
    return "\n".join(line.center(cwidth) for line in lines)
def MyChoropleth(mydf, myfile='', myvar='',
                 mylegend='',
                 k=5,
                 extent=[-180, -90, 180, 90],
                 bbox_to_anchor=(0.2, 0.5),
                 edgecolor='white', facecolor='lightgray',
                 scheme='FisherJenks', bins=None, pct=None,
                 legend_labels=None,
                 save=True,
                 percent=False,
                 cmap='Reds',
                 **kwargs):
    '''Draw a choropleth of ``mydf[myvar]`` over a backdrop of all countries.

    Relies on notebook-level names: ``countries`` (backdrop polygons),
    ``pathgraphs`` (output folder prefix) and ``plt``.

    Parameters
    ----------
    mydf : GeoDataFrame holding the column ``myvar``.
    myfile, myvar : str
        Figures are saved as ``pathgraphs + myfile + '_' + myvar`` with the
        extensions ``.pdf`` and ``.png``.
    mylegend : str
        Legend title.
    k, pct, bins
        Options offered to the classifier named by ``scheme``; each one is
        only passed on when that classifier's constructor declares it.
    extent : sequence
        Extent handed to the backdrop plot (the default list is never
        mutated here).
    bbox_to_anchor : tuple
        Legend anchor.
    edgecolor, facecolor
        Colours of the backdrop polygons.
    scheme : str or object
        Name of a supported mapclassify classifier, or an object that is
        passed to geoplot as is.
    legend_labels : list of str, optional
        When omitted, "lower - upper" labels are built from the class bounds.
    save : bool
        Whether to write the figure files.
    percent : bool
        Format the automatic labels as percentages.
    cmap
        Colormap for the choropleth.
    **kwargs
        Accepted for call compatibility; not used.

    Raises
    ------
    ValueError
        If ``scheme`` is a string that is not a supported classifier name.
    '''
    # Color scheme
    scheme = _build_scheme(mydf[myvar], scheme, k, pct, bins)
    if legend_labels is None:
        # Format legend: one label per class, starting at the data minimum
        legend_labels = _legend_labels(scheme.bins, mydf[myvar].min(), percent)
    # Plot
    ax = gplt.choropleth(
        mydf, hue=myvar, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
        edgecolor='white', linewidth=1,
        cmap=cmap, legend=True,
        scheme=scheme,
        legend_kwargs={'bbox_to_anchor': bbox_to_anchor,
                       'frameon': True,
                       'title': mylegend,
                       },
        legend_labels=legend_labels,
        figsize=(24, 16),
        rasterized=True,
    )
    # Backdrop of every country, drawn on the same axes
    gplt.polyplot(
        countries, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
        edgecolor=edgecolor, facecolor=facecolor,
        ax=ax,
        rasterized=True,
        extent=extent,
    )
    if save:
        for extension in ('.pdf', '.png'):
            plt.savefig(pathgraphs + myfile + '_' + myvar + extension, dpi=300, bbox_inches='tight')


# Classifier names MyChoropleth accepts as strings
_SCHEME_NAMES = (
    'EqualInterval', 'Quantiles', 'BoxPlot', 'FisherJenks',
    'FisherJenksSampled', 'HeadTailBreaks', 'JenksCaspall',
    'JenksCaspallForced', 'JenksCaspallSampled', 'KClassifiers',
    'Percentiles', 'UserDefined',
)


def _build_scheme(values, scheme, k, pct, bins):
    '''Return a fitted mapclassify classifier for ``values``.

    A non-string ``scheme`` is handed back untouched.
    '''
    import inspect

    if not isinstance(scheme, str):
        return scheme
    if scheme not in _SCHEME_NAMES:
        raise ValueError(
            f"Unknown scheme {scheme!r}; expected one of: {', '.join(_SCHEME_NAMES)}"
        )
    factory = getattr(mc, scheme)
    # Offer k / pct / bins only to constructors that declare them. Passing k
    # to every classifier raises TypeError for those without a k argument,
    # and forcing pct=None overrides a classifier's own default.
    accepted = inspect.signature(factory).parameters
    offered = {'k': k, 'pct': pct, 'bins': bins}
    options = {name: value for name, value in offered.items()
               if name in accepted and value is not None}
    fitted = factory(values, **options)
    # NOTE(review): KClassifiers appears to expose the classifier it picked
    # as `.best` instead of carrying `bins` itself -- confirm against the
    # installed mapclassify version.
    if not hasattr(fitted, 'bins') and hasattr(fitted, 'best'):
        fitted = fitted.best
    return fitted


def _legend_labels(upper_bounds, lowest, percent):
    '''Format one "lower - upper" label per class bound.'''
    labels = []
    lower = lowest
    for upper in upper_bounds:
        if percent:
            labels.append(f'{lower:.0%} - {upper:.0%}')
        else:
            labels.append(f'{float(lower):,.0f} - {float(upper):,.0f}')
        # The next class starts where this one ends
        lower = upper
    return labels
# Paths
# Output folders for data and figures. makedirs(exist_ok=True) creates them
# when missing and is a no-op otherwise, without a separate existence check.
pathout = './data/'
os.makedirs(pathout, exist_ok=True)
pathgraphs = './graphs/'
os.makedirs(pathgraphs, exist_ok=True)
Let's plot the countries for which Colombian citizens do not require visas¶
The website of the Colombian Ministry of Foreign Affairs (Cancillería) has a list of visa requirements for Colombians.
Let's use it to map countries for which visas are not required.
Below is the link to the information.
The problem is that it is a pdf file. Let's open the website and check it out
# Import display options for showing websites
from IPython.display import IFrame
url = 'https://www.cancilleria.gov.co/sites/default/files/FOTOS2020/relacion_de_paises_que_exigen_o_no_visas_a_colombianos_17-04-2020.pdf'
IFrame(url, width=800, height=400)
Roadblock¶
Someone forgot to make our life easy and made the data available only as a pdf.
Luckily, python has tools to deal with this.¶
So let's download it, save it to disk, and use these tools to process the pdf into a pandas.DataFrame.
# Import package for downloading internet content and save it to file
import requests
url = 'https://www.cancilleria.gov.co/sites/default/files/FOTOS2020/relacion_de_paises_que_exigen_o_no_visas_a_colombianos_17-04-2020.pdf'
# A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=60)
# Stop on an HTTP error instead of saving an error page as visas.pdf
response.raise_for_status()
with open(pathout + 'visas.pdf', 'wb') as f:
    f.write(response.content)
# Import package to read pdf tables
import camelot
# Extract the tables found on pages 1 to 7 of the saved file
visas = camelot.read_pdf(pathout + 'visas.pdf', pages='1-7')
Let's explore the visas object
visas
<TableList n=7>
So there are 7 tables in visas. What does Table 1 have?
visas[0]
<Table shape=(28, 3)>
visas[0].df
0 | 1 | 2 | |
---|---|---|---|
0 | MINISTERIO DE RELACIONES EXTERIORES DE COLOMBIA | ||
1 | DIRECCION DE ASUNTOS MIGRATORIOS, CONSULARES Y... | ||
2 | COORDINACION DE VISAS E INMIGRACION | ||
3 | Estados y territorios que exigen o NO visas a ... | ||
4 | EXIGEN VISA A | ||
5 | PAIS | SI | NO |
6 | Afganistán | X | |
7 | Albania | X | |
8 | Alemania | X | |
9 | Andorra | X | |
10 | Angola | X | |
11 | Antigua y Barbuda | X | |
12 | Arabia Saudita | X | |
13 | Argelia | X | |
14 | Argentina | X | |
15 | Armenia | ||
16 | Australia | X X | |
17 | Austria | X | |
18 | Azerbaiyán | X (Visa electrónica) | |
19 | Bahamas | X | |
20 | Bahréin | X (visa a la llegada y visa electrónica) | |
21 | Bangladesh | X | |
22 | Barbados | X | |
23 | Bélgica | X | |
24 | Belice | X | |
25 | Benin | ||
26 | Belarús | X X | |
27 | Bolivia | X |
Ok, let's concatenate all these pandas dataframes.
# Stack the extracted tables into one frame with a fresh 0..n-1 row index
table_frames = [table.df for table in visas]
visadf = pd.concat(table_frames, ignore_index=True)
visadf
0 | 1 | 2 | |
---|---|---|---|
0 | MINISTERIO DE RELACIONES EXTERIORES DE COLOMBIA | ||
1 | DIRECCION DE ASUNTOS MIGRATORIOS, CONSULARES Y... | ||
2 | COORDINACION DE VISAS E INMIGRACION | ||
3 | Estados y territorios que exigen o NO visas a ... | ||
4 | EXIGEN VISA A | ||
... | ... | ... | ... |
220 | Taiwan | X Visa electrónica | |
221 | Wallis y Futuna (Francia) | X | |
222 | |||
223 | Actualización 21 -10-2019 | ||
224 | El presente cuadro presenta generalidades sobr... |
225 rows × 3 columns
We need to correct the header¶
# Row 5 of the stacked table carries the real column titles (PAIS, SI, NO)
visadf.columns = visadf.iloc[5]
visadf.head(10)
5 | PAIS | SI | NO |
---|---|---|---|
0 | MINISTERIO DE RELACIONES EXTERIORES DE COLOMBIA | ||
1 | DIRECCION DE ASUNTOS MIGRATORIOS, CONSULARES Y... | ||
2 | COORDINACION DE VISAS E INMIGRACION | ||
3 | Estados y territorios que exigen o NO visas a ... | ||
4 | EXIGEN VISA A | ||
5 | PAIS | SI | NO |
6 | Afganistán | X | |
7 | Albania | X | |
8 | Alemania | X | |
9 | Andorra | X |
Let's remove the first 6 rows
# Drop rows 0-5: the letterhead lines plus the header row already promoted
# to column names. copy() gives an independent frame to modify below.
visadf = visadf.iloc[6:].copy()
Clear the leftover name (5) of the column index
visadf.columns.name = ''
visadf.head(10)
PAIS | SI | NO | |
---|---|---|---|
6 | Afganistán | X | |
7 | Albania | X | |
8 | Alemania | X | |
9 | Andorra | X | |
10 | Angola | X | |
11 | Antigua y Barbuda | X | |
12 | Arabia Saudita | X | |
13 | Argelia | X | |
14 | Argentina | X | |
15 | Armenia |
Let's code SI (YES) as 1 and NO as 0
# Exact matches only: a lone 'X' -> 1, an empty cell -> 0; any other SI text
# has no entry in the dict and therefore becomes NaN.
# NOTE(review): non-country rows (the 'OTROS TERRITORIOS' divider, the footer
# notes) also have an empty SI cell, so they are coded 0 as well.
visadf['visa_req'] = visadf.SI.map({'X':1, '':0})
Let's check whether things were mapped correctly
visadf.loc[visadf.visa_req.isna()]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
16 | Australia | X X | NaN | |
18 | Azerbaiyán | X (Visa electrónica) | NaN | |
20 | Bahréin | X (visa a la llegada y visa electrónica) | NaN | |
26 | Belarús | X X | NaN | |
34 | Burundi | X X X | NaN | |
36 | Cabo Verde | X (Visa a la llegada) | NaN | |
37 | Camboya | X (Visa a la llegada) | NaN | |
39 | Canadá | X X X | NaN | |
46 | Congo | X X X | NaN | |
50 | Costa de Marfil | X X | NaN | |
58 | Egipto | X (Visa a la llegada) | NaN | |
68 | Fiji | X X | NaN | |
76 | Granada | X X | NaN | |
80 | Guinea-Bissau | X X X | NaN | |
88 | Irán | X X X X X | NaN | |
93 | Islas Salomón | X X | NaN | |
98 | Jordania | X X | NaN | |
100 | Kenia | X Visa a la llegada | NaN | |
102 | Kiribati | X X | NaN | |
105 | Laos República Democrática P | X Visa a la llegada | NaN | |
110 | Libia | X X | NaN | |
116 | Malasia | X X X | NaN | |
122 | Mauricio | X X X | NaN | |
131 | Myanmar | X (Visa a la llegada) | NaN | |
135 | Nicaragua | X (visa a la llegada para titulares de visa de... | NaN | |
137 | Nigeria | X X | NaN | |
140 | Omán | X (Visa de turismo al ingreso a Omán en los pu... | NaN | |
143 | Palau | X X | NaN | |
156 | Ruanda | X (Visa electrónica) | NaN | |
167 | Sierra Leona | X X | NaN | |
172 | Sudáfrica | X X X X X X | NaN | |
179 | Tailandia | X X | NaN | |
180 | Tanzania | X Visa a la llegada | NaN | |
183 | Togo | X X X X | NaN | |
194 | Vanuatu | X X | NaN | |
197 | Yemen | X X X | NaN | |
207 | Macao (SARG-China) (*) | X Visa a la llegada | NaN | |
220 | Taiwan | X Visa electrónica | NaN |
IFrame(url, width=800, height=400)
visadf.loc[(visadf.SI=='X X') | (visadf.SI.shift(1)=='X X') | (visadf.SI.shift(-1)=='X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
15 | Armenia | 0.0 | ||
16 | Australia | X X | NaN | |
17 | Austria | X | 0.0 | |
25 | Benin | 0.0 | ||
26 | Belarús | X X | NaN | |
27 | Bolivia | X | 0.0 | |
49 | Corea República Popular Dem. | 0.0 | ||
50 | Costa de Marfil | X X | NaN | |
51 | Costa Rica | X | A titulares de Visa de EE UU o Schengen vigen... | 1.0 |
67 | EtiopÃa | 0.0 | ||
68 | Fiji | X X | NaN | |
69 | Filipinas | X Hasta por 30 dÃas | 0.0 | |
75 | Ghana | 0.0 | ||
76 | Granada | X X | NaN | |
77 | Grecia | X | 0.0 | |
92 | Islas Marshall | 0.0 | ||
93 | Islas Salomón | X X | NaN | |
94 | Israel | X | 0.0 | |
97 | Japón | 0.0 | ||
98 | Jordania | X X | NaN | |
99 | Kazajstán | X (Hasta por 30 dÃas) | 0.0 | |
101 | Kirguistán | 0.0 | ||
102 | Kiribati | X X | NaN | |
103 | Kuwait | X | 1.0 | |
109 | Liberia | 0.0 | ||
110 | Libia | X X | NaN | |
111 | Liechtenstein | X | 0.0 | |
136 | NÃger | 0.0 | ||
137 | Nigeria | X X | NaN | |
138 | Noruega | X | 0.0 | |
142 | Pakistán | 0.0 | ||
143 | Palau | X X | NaN | |
144 | Panamá | X | 0.0 | |
166 | Seychelles | 0.0 | ||
167 | Sierra Leona | X X | NaN | |
168 | Singapur | X Hasta por 30 dÃas | 0.0 | |
178 | Suazilandia | 0.0 | ||
179 | Tailandia | X X | NaN | |
180 | Tanzania | X Visa a la llegada | NaN | |
193 | Uzbekistán | 0.0 | ||
194 | Vanuatu | X X | NaN | |
195 | Venezuela | X | 0.0 |
visadf.loc[(visadf.SI=='X X X') | (visadf.SI.shift(1)=='X X X') | (visadf.SI.shift(-1)=='X X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
33 | Burkina Faso | 0.0 | ||
34 | Burundi | X X X | NaN | |
35 | Bután | 0.0 | ||
38 | Camerún | 0.0 | ||
39 | Canadá | X X X | NaN | |
40 | Chad | 0.0 | ||
45 | Comoras | 0.0 | ||
46 | Congo | X X X | NaN | |
47 | Congo República Democrática | 0.0 | ||
79 | Guinea | 0.0 | ||
80 | Guinea-Bissau | X X X | NaN | |
81 | Guinea Ecuatorial | 0.0 | ||
115 | Madagascar | 0.0 | ||
116 | Malasia | X X X | NaN | |
117 | Malawi | 0.0 | ||
121 | Marruecos | 0.0 | ||
122 | Mauricio | X X X | NaN | |
123 | Mauritania | 0.0 | ||
196 | Vietnam | 0.0 | ||
197 | Yemen | X X X | NaN | |
198 | Zambia | 0.0 |
Ok, it seems we have two types of errors.
- First, sometimes the type of visa is spelled out next to the X, e.g., Azerbaiyán.
- Second, the OCR software has mixed some rows, so that now we have "X X", "X X X", etc.
Looking at the pdf, it seems this is due to assigning an X from a previous row to the current row ("X X") or from both the previous and next rows ("X X X").
Let's try to correct these errors programmatically (obviously sometimes it may just be faster and better to export the dataframe, correct it by hand and then load the corrected one, but we're here to learn, right?).
First, let's replace the repeated X with what seems to be the correct data.
X X¶
# Flag every row whose SI cell reads 'X X' together with the row just above it
pattern = 'X X'
affected = visadf.SI.eq(pattern) | visadf.SI.shift(-1).eq(pattern)
visadf.loc[affected, 'visa_req'] = 1
visadf.loc[affected]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
15 | Armenia | 1.0 | ||
16 | Australia | X X | 1.0 | |
25 | Benin | 1.0 | ||
26 | Belarús | X X | 1.0 | |
49 | Corea República Popular Dem. | 1.0 | ||
50 | Costa de Marfil | X X | 1.0 | |
67 | EtiopÃa | 1.0 | ||
68 | Fiji | X X | 1.0 | |
75 | Ghana | 1.0 | ||
76 | Granada | X X | 1.0 | |
92 | Islas Marshall | 1.0 | ||
93 | Islas Salomón | X X | 1.0 | |
97 | Japón | 1.0 | ||
98 | Jordania | X X | 1.0 | |
101 | Kirguistán | 1.0 | ||
102 | Kiribati | X X | 1.0 | |
109 | Liberia | 1.0 | ||
110 | Libia | X X | 1.0 | |
136 | NÃger | 1.0 | ||
137 | Nigeria | X X | 1.0 | |
142 | Pakistán | 1.0 | ||
143 | Palau | X X | 1.0 | |
166 | Seychelles | 1.0 | ||
167 | Sierra Leona | X X | 1.0 | |
178 | Suazilandia | 1.0 | ||
179 | Tailandia | X X | 1.0 | |
193 | Uzbekistán | 1.0 | ||
194 | Vanuatu | X X | 1.0 |
X X X¶
# Flag every 'X X X' row plus its immediate neighbours above and below
pattern = 'X X X'
affected = visadf.SI.eq(pattern)
for offset in (1, -1):
    affected = affected | visadf.SI.shift(offset).eq(pattern)
visadf.loc[affected, 'visa_req'] = 1
visadf.loc[affected]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
33 | Burkina Faso | 1.0 | ||
34 | Burundi | X X X | 1.0 | |
35 | Bután | 1.0 | ||
38 | Camerún | 1.0 | ||
39 | Canadá | X X X | 1.0 | |
40 | Chad | 1.0 | ||
45 | Comoras | 1.0 | ||
46 | Congo | X X X | 1.0 | |
47 | Congo República Democrática | 1.0 | ||
79 | Guinea | 1.0 | ||
80 | Guinea-Bissau | X X X | 1.0 | |
81 | Guinea Ecuatorial | 1.0 | ||
115 | Madagascar | 1.0 | ||
116 | Malasia | X X X | 1.0 | |
117 | Malawi | 1.0 | ||
121 | Marruecos | 1.0 | ||
122 | Mauricio | X X X | 1.0 | |
123 | Mauritania | 1.0 | ||
196 | Vietnam | 1.0 | ||
197 | Yemen | X X X | 1.0 | |
198 | Zambia | 1.0 |
X X X X¶
visadf.loc[(visadf.SI=='X X X X') | (visadf.SI.shift(1)=='X X X X') | (visadf.SI.shift(-1)=='X X X X') | (visadf.SI.shift(2)=='X X X X') | (visadf.SI.shift(-2)=='X X X X') | (visadf.SI.shift(-3)=='X X X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
180 | Tanzania | X Visa a la llegada | NaN | |
181 | Tayikistán | 0.0 | ||
182 | Timor Oriental | 0.0 | ||
183 | Togo | X X X X | NaN | |
184 | Tonga | 0.0 | ||
185 | Trinidad y Tobago | X | 0.0 |
# Flag the 'X X X X' row, the row right below it and the two rows above it
pattern = 'X X X X'
affected = visadf.SI.eq(pattern)
for offset in (1, -1, -2):
    affected = affected | visadf.SI.shift(offset).eq(pattern)
visadf.loc[affected, 'visa_req'] = 1
visadf.loc[affected]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
181 | Tayikistán | 1.0 | ||
182 | Timor Oriental | 1.0 | ||
183 | Togo | X X X X | 1.0 | |
184 | Tonga | 1.0 |
X X X X X¶
visadf.loc[(visadf.SI=='X X X X X') | (visadf.SI.shift(1)=='X X X X X') | (visadf.SI.shift(-1)=='X X X X X') | (visadf.SI.shift(-2)=='X X X X X') | (visadf.SI.shift(2)=='X X X X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
86 | India | 0.0 | ||
87 | Indonesia | 0.0 | ||
88 | Irán | X X X X X | NaN | |
89 | Iraq | 0.0 | ||
90 | Irlanda | 0.0 |
# Flag the 'X X X X X' row and the two rows on either side of it
pattern = 'X X X X X'
affected = visadf.SI.eq(pattern)
for offset in (1, -1, -2, 2):
    affected = affected | visadf.SI.shift(offset).eq(pattern)
visadf.loc[affected, 'visa_req'] = 1
visadf.loc[affected]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
86 | India | 1.0 | ||
87 | Indonesia | 1.0 | ||
88 | Irán | X X X X X | 1.0 | |
89 | Iraq | 1.0 | ||
90 | Irlanda | 1.0 |
X X X X X X¶
visadf.loc[(visadf.SI=='X X X X X X') | (visadf.SI.shift(1)=='X X X X X X') | (visadf.SI.shift(-1)=='X X X X X X') | (visadf.SI.shift(-2)=='X X X X X X') | (visadf.SI.shift(2)=='X X X X X X') | (visadf.SI.shift(-3)=='X X X X X X') | (visadf.SI.shift(3)=='X X X X X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
169 | Siria | 0.0 | ||
170 | Somalia | 0.0 | ||
171 | Sri Lanka | 0.0 | ||
172 | Sudáfrica | X X X X X X | NaN | |
173 | Sudán del Sur | 0.0 | ||
174 | Sudán | 0.0 | ||
175 | Suecia | X | 0.0 |
# Flag the 'X X X X X X' row, the three rows above it and the two below it
pattern = 'X X X X X X'
affected = visadf.SI.eq(pattern)
for offset in (1, -1, -2, 2, -3):
    affected = affected | visadf.SI.shift(offset).eq(pattern)
visadf.loc[affected, 'visa_req'] = 1
visadf.loc[affected]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
169 | Siria | 1.0 | ||
170 | Somalia | 1.0 | ||
171 | Sri Lanka | 1.0 | ||
172 | Sudáfrica | X X X X X X | 1.0 | |
173 | Sudán del Sur | 1.0 | ||
174 | Sudán | 1.0 |
Let's also mark a visa as required for any row whose SI entry contains the word "visa".
visadf.loc[visadf.SI.str.lower().str.find('visa')!=-1]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
18 | Azerbaiyán | X (Visa electrónica) | NaN | |
20 | Bahréin | X (visa a la llegada y visa electrónica) | NaN | |
36 | Cabo Verde | X (Visa a la llegada) | NaN | |
37 | Camboya | X (Visa a la llegada) | NaN | |
58 | Egipto | X (Visa a la llegada) | NaN | |
100 | Kenia | X Visa a la llegada | NaN | |
105 | Laos República Democrática P | X Visa a la llegada | NaN | |
131 | Myanmar | X (Visa a la llegada) | NaN | |
135 | Nicaragua | X (visa a la llegada para titulares de visa de... | NaN | |
140 | Omán | X (Visa de turismo al ingreso a Omán en los pu... | NaN | |
156 | Ruanda | X (Visa electrónica) | NaN | |
180 | Tanzania | X Visa a la llegada | NaN | |
207 | Macao (SARG-China) (*) | X Visa a la llegada | NaN | |
220 | Taiwan | X Visa electrónica | NaN |
# Rows whose SI note mentions "visa" (any capitalization) still need a visa.
# na=False keeps missing SI values out of the selection; the previous
# find(...) != -1 test counted them as matches because NaN != -1 is True.
mentions_visa = visadf.SI.str.contains('visa', case=False, regex=False, na=False)
visadf.loc[mentions_visa, 'visa_req'] = 1
visadf.loc[mentions_visa]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
18 | Azerbaiyán | X (Visa electrónica) | 1.0 | |
20 | Bahréin | X (visa a la llegada y visa electrónica) | 1.0 | |
36 | Cabo Verde | X (Visa a la llegada) | 1.0 | |
37 | Camboya | X (Visa a la llegada) | 1.0 | |
58 | Egipto | X (Visa a la llegada) | 1.0 | |
100 | Kenia | X Visa a la llegada | 1.0 | |
105 | Laos República Democrática P | X Visa a la llegada | 1.0 | |
131 | Myanmar | X (Visa a la llegada) | 1.0 | |
135 | Nicaragua | X (visa a la llegada para titulares de visa de... | 1.0 | |
140 | Omán | X (Visa de turismo al ingreso a Omán en los pu... | 1.0 | |
156 | Ruanda | X (Visa electrónica) | 1.0 | |
180 | Tanzania | X Visa a la llegada | 1.0 | |
207 | Macao (SARG-China) (*) | X Visa a la llegada | 1.0 | |
220 | Taiwan | X Visa electrónica | 1.0 |
Let's check again¶
visadf.loc[visadf.visa_req.isna()]
PAIS | SI | NO | visa_req |
---|
Ok, it seems we have coded which countries require a visa from Colombian citizens and which do not. Let's analyze this data a bit.
Recode visa requirements as Yes/No¶
visadf['visa_req_YN'] = visadf.visa_req.map({0:'NO', 1:'YES'})
visadf
PAIS | SI | NO | visa_req | visa_req_YN | |
---|---|---|---|---|---|
6 | Afganistán | X | 1.0 | YES | |
7 | Albania | X | 0.0 | NO | |
8 | Alemania | X | 0.0 | NO | |
9 | Andorra | X | 0.0 | NO | |
10 | Angola | X | 1.0 | YES | |
... | ... | ... | ... | ... | ... |
220 | Taiwan | X Visa electrónica | 1.0 | YES | |
221 | Wallis y Futuna (Francia) | X | 0.0 | NO | |
222 | 0.0 | NO | |||
223 | Actualización 21 -10-2019 | 0.0 | NO | ||
224 | El presente cuadro presenta generalidades sobr... | 0.0 | NO |
219 rows × 5 columns
visadf.hist()
visadf.visa_req.describe()
count 219.000000 mean 0.547945 std 0.498836 min 0.000000 25% 0.000000 50% 1.000000 75% 1.000000 max 1.000000 Name: visa_req, dtype: float64
df = visadf.groupby('visa_req_YN').count().reset_index()
df
visa_req_YN | PAIS | SI | NO | visa_req | |
---|---|---|---|---|---|
0 | NO | 99 | 99 | 99 | 99 |
1 | YES | 120 | 120 | 120 | 120 |
# Seaborn-wide styling: 11.7 x 8.27 inch figures and the larger "talk" fonts
sns.set(rc={'figure.figsize': (11.7, 8.27)})
#sns.reset_orig()
sns.set_context("talk")
# Plot the number of rows in each visa category as bars
fig, ax = plt.subplots()
sns.barplot(data=df, x='visa_req_YN', y='visa_req', alpha=1, ax=ax)
for which in ('major', 'minor'):
    ax.tick_params(axis='both', which=which)
ax.set_xlabel('Visa Required')
ax.set_ylabel('Number of Countries')
Text(0, 0.5, 'Number of Countries')
Let's try to map these countries. First let's get the Natural Earth shapefile.
import requests
import io
#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
# Browser-like headers sent with the download request
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
url = 'https://naturalearth.s3.amazonaws.com/10m_cultural/ne_10m_admin_0_countries.zip'
# A timeout keeps the cell from hanging forever on a stalled connection
r = requests.get(url, headers=headers, timeout=120)
# Fail on an HTTP error rather than handing an error page to the file reader
r.raise_for_status()
# Read the zipped shapefile straight from memory
countries = gp.read_file(io.BytesIO(r.content))
#countries = gpd.read_file('https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip')
countries
featurecla | scalerank | LABELRANK | SOVEREIGNT | SOV_A3 | ADM0_DIF | LEVEL | TYPE | TLC | ADMIN | ... | FCLASS_TR | FCLASS_ID | FCLASS_PL | FCLASS_GR | FCLASS_IT | FCLASS_NL | FCLASS_SE | FCLASS_BD | FCLASS_UA | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Admin-0 country | 0 | 2 | Indonesia | IDN | 0 | 2 | Sovereign country | 1 | Indonesia | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((117.70361 4.16341, 117.70361 4... |
1 | Admin-0 country | 0 | 3 | Malaysia | MYS | 0 | 2 | Sovereign country | 1 | Malaysia | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((117.70361 4.16341, 117.69711 4... |
2 | Admin-0 country | 0 | 2 | Chile | CHL | 0 | 2 | Sovereign country | 1 | Chile | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... |
3 | Admin-0 country | 0 | 3 | Bolivia | BOL | 0 | 2 | Sovereign country | 1 | Bolivia | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... |
4 | Admin-0 country | 0 | 2 | Peru | PER | 0 | 2 | Sovereign country | 1 | Peru | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
253 | Admin-0 country | 0 | 4 | China | CH1 | 1 | 2 | Country | 1 | Macao S.A.R | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((113.55860 22.16303, 113.56943 ... |
254 | Admin-0 country | 6 | 5 | Australia | AU1 | 1 | 2 | Dependency | 1 | Ashmore and Cartier Islands | ... | None | None | None | None | None | None | None | None | None | POLYGON ((123.59702 -12.42832, 123.59775 -12.4... |
255 | Admin-0 country | 6 | 8 | Bajo Nuevo Bank (Petrel Is.) | BJN | 0 | 2 | Indeterminate | 1 | Bajo Nuevo Bank (Petrel Is.) | ... | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | POLYGON ((-79.98929 15.79495, -79.98782 15.796... |
256 | Admin-0 country | 6 | 5 | Serranilla Bank | SER | 0 | 2 | Indeterminate | 1 | Serranilla Bank | ... | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | POLYGON ((-78.63707 15.86209, -78.64041 15.864... |
257 | Admin-0 country | 6 | 6 | Scarborough Reef | SCR | 0 | 2 | Indeterminate | 1 | Scarborough Reef | ... | None | None | None | None | None | None | None | None | None | POLYGON ((117.75389 15.15437, 117.75569 15.151... |
258 rows × 169 columns
Luckily there are country names in Spanish. Let's see if we can merge these two data sets.
countries.NAME_ES
0 Indonesia 1 Malasia 2 Chile 3 Bolivia 4 Perú ... 253 Macao 254 Islas Ashmore y Cartier 255 Bajo Nuevo 256 Isla Serranilla 257 Bajo de Masinloc Name: NAME_ES, Length: 258, dtype: object
# Inner join on the Spanish country name: rows without a partner are dropped
col_visa = countries.merge(visadf, how='inner', left_on='NAME_ES', right_on='PAIS')
# Two-colour map for the two classes labelled NO and YES below
cmap = mpl.colors.ListedColormap(['blue', 'red'])
mylegend = center_wrap(["Visa Requirements", "For Colombian Citizens"], cwidth=32, width=32)
MyChoropleth(
    mydf=col_visa,
    myfile='col_visa',
    myvar='visa_req',
    mylegend=mylegend,
    k=1,
    bbox_to_anchor=(0.25, 0.3),
    edgecolor='white',
    facecolor='lightgray',
    cmap=cmap,
    scheme='UserDefined',
    bins=[0, 1],
    legend_labels=['NO', 'YES'],
    save=False,
)
So it seems not everything merged correctly
col_visa.shape
(164, 174)
visadf.shape
(219, 5)
# col_visa is the result of an inner join, so it only holds matched rows and
# can never show a missing visa_req. List instead the names of the visa table
# that have no counterpart among the shapefile's Spanish names.
visadf.loc[~visadf.PAIS.isin(countries.NAME_ES), 'PAIS'].sort_values()
Series([], Name: NAME_ES, dtype: object)
So we are not linking all countries.
This is usually due to symbols like accents and ~, but in this case also because the tail of the data frame includes territories of countries, so their names are non-standard (and OCR may have made some mistakes).
visadf.tail(25)
PAIS | SI | NO | visa_req | visa_req_YN | |
---|---|---|---|---|---|
200 | OTROS TERRITORIOS | 0.0 | NO | ||
201 | Aruba (PaÃses Bajos) | X | 0.0 | NO | |
202 | Bonaire (PaÃses Bajos) | X | 0.0 | NO | |
203 | Curazao (PaÃses Bajos) | X | 0.0 | NO | |
204 | Guadalupe (Francia) | X | 0.0 | NO | |
205 | Guyana Francesa | X | 0.0 | NO | |
206 | Hong Kong (SARG-China) | X Por 90 dÃas | 0.0 | NO | |
207 | Macao (SARG-China) (*) | X Visa a la llegada | 1.0 | YES | |
208 | Martinica (Francia) | X | 0.0 | NO | |
209 | Mayotte (Francia) | X | 0.0 | NO | |
210 | Nueva Caledonia (Francia) | X | 0.0 | NO | |
211 | Palestina | X | 1.0 | YES | |
212 | Polinesia Francesa | X | 0.0 | NO | |
213 | Réunion (Francia) | X | 0.0 | NO | |
214 | Saba (PaÃses Bajos) | X | 0.0 | NO | |
215 | Saint Barthélémy (Francia) | X | 1.0 | YES | |
216 | Saint Pïerre et Miquelon (Francia) | X | 0.0 | NO | |
217 | Saint Martin (Francia) | X | 1.0 | YES | |
218 | Sint Maarten (PaÃses Bajos) | X | 0.0 | NO | |
219 | Sint Eustatius (PaÃses Bajos) | X | 0.0 | NO | |
220 | Taiwan | X Visa electrónica | 1.0 | YES | |
221 | Wallis y Futuna (Francia) | X | 0.0 | NO | |
222 | 0.0 | NO | |||
223 | Actualización 21 -10-2019 | 0.0 | NO | ||
224 | El presente cuadro presenta generalidades sobr... | 0.0 | NO |
Let's correct the country names to improve matching.
It's always a good practice to keep the original names.
# Keep the raw names, then cut each name at its first "(" (if any) and trim
# the surrounding blanks
visadf['PAIS_OR'] = visadf.PAIS
visadf.PAIS = visadf.PAIS_OR.str.partition('(')[0].str.strip()
visadf.tail(30)
PAIS | SI | NO | visa_req | visa_req_YN | PAIS_OR | |
---|---|---|---|---|---|---|
195 | Venezuela | X | 0.0 | NO | Venezuela | |
196 | Vietnam | 1.0 | YES | Vietnam | ||
197 | Yemen | X X X | 1.0 | YES | Yemen | |
198 | Zambia | 1.0 | YES | Zambia | ||
199 | Zimbabwe | X | 1.0 | YES | Zimbabwe | |
200 | OTROS TERRITORIOS | 0.0 | NO | OTROS TERRITORIOS | ||
201 | Aruba | X | 0.0 | NO | Aruba (PaÃses Bajos) | |
202 | Bonaire | X | 0.0 | NO | Bonaire (PaÃses Bajos) | |
203 | Curazao | X | 0.0 | NO | Curazao (PaÃses Bajos) | |
204 | Guadalupe | X | 0.0 | NO | Guadalupe (Francia) | |
205 | Guyana Francesa | X | 0.0 | NO | Guyana Francesa | |
206 | Hong Kong | X Por 90 dÃas | 0.0 | NO | Hong Kong (SARG-China) | |
207 | Macao | X Visa a la llegada | 1.0 | YES | Macao (SARG-China) (*) | |
208 | Martinica | X | 0.0 | NO | Martinica (Francia) | |
209 | Mayotte | X | 0.0 | NO | Mayotte (Francia) | |
210 | Nueva Caledonia | X | 0.0 | NO | Nueva Caledonia (Francia) | |
211 | Palestina | X | 1.0 | YES | Palestina | |
212 | Polinesia Francesa | X | 0.0 | NO | Polinesia Francesa | |
213 | Réunion | X | 0.0 | NO | Réunion (Francia) | |
214 | Saba | X | 0.0 | NO | Saba (PaÃses Bajos) | |
215 | Saint Barthélémy | X | 1.0 | YES | Saint Barthélémy (Francia) | |
216 | Saint Pïerre et Miquelon | X | 0.0 | NO | Saint Pïerre et Miquelon (Francia) | |
217 | Saint Martin | X | 1.0 | YES | Saint Martin (Francia) | |
218 | Sint Maarten | X | 0.0 | NO | Sint Maarten (PaÃses Bajos) | |
219 | Sint Eustatius | X | 0.0 | NO | Sint Eustatius (PaÃses Bajos) | |
220 | Taiwan | X Visa electrónica | 1.0 | YES | Taiwan | |
221 | Wallis y Futuna | X | 0.0 | NO | Wallis y Futuna (Francia) | |
222 | 0.0 | NO | ||||
223 | Actualización 21 -10-2019 | 0.0 | NO | Actualización 21 -10-2019 | ||
224 | El presente cuadro presenta generalidades sobr... | 0.0 | NO | El presente cuadro presenta generalidades sobr... |
# Repeat the inner join now that the territory names have been cleaned
col_visa = countries.merge(visadf, how='inner', left_on='NAME_ES', right_on='PAIS')
# Two-colour map for the two classes labelled NO and YES below
cmap = mpl.colors.ListedColormap(['blue', 'red'])
mylegend = center_wrap(["Visa Requirements", "For Colombian Citizens"], cwidth=32, width=32)
MyChoropleth(
    mydf=col_visa,
    myfile='col_visa',
    myvar='visa_req',
    mylegend=mylegend,
    k=1,
    bbox_to_anchor=(0.25, 0.3),
    edgecolor='white',
    facecolor='lightgray',
    cmap=cmap,
    scheme='UserDefined',
    bins=[0, 1],
    legend_labels=['NO', 'YES'],
    save=False,
)