-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_html_gdp_example.py
52 lines (43 loc) · 1.51 KB
/
read_html_gdp_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pandas as pd
import numpy as np
from unicodedata import normalize
import matplotlib.pyplot as plt
def clean_normalize_whitespace(x):
""" Normalize unicode characters and strip trailing spaces
"""
if isinstance(x, str):
return normalize('NFKC', x).strip()
else:
return x
# Read in the Wikipedia page and get the DataFrame
table_GDP = pd.read_html(
'https://en.wikipedia.org/wiki/Economy_of_the_United_States',
match='Nominal GDP')
df_GDP = table_GDP[0]
# Clean up the DataFrame and Columns
df_GDP = df_GDP.applymap(clean_normalize_whitespace)
df_GDP.columns = df_GDP.columns.to_series().apply(clean_normalize_whitespace)
# Determine numeric types for each column
# Adjust original for changes in column names and footnote numbers
col_type = {
'Year': 'int',
'Nominal GDP(billions USD)': 'float',
'GDP per capita(USD)': 'int',
'GDP growth(real)': 'float',
'Inflation rate(in %)': 'float',
'Unemployment(in %)': 'float',
'Budget balance(in % of GDP)[105]': 'float',
'Government debt held by public(in % of GDP)[106]': 'float',
'Current accountbalance(in % of GDP)': 'float'
}
# Values to replace
clean_dict = {'%': '', '−': '-', '\(est\)': ''}
# Replace values and convert to numeric values
df_GDP = df_GDP.replace(clean_dict, regex=True).replace({
'-n/a ': np.nan
}).astype(col_type)
df_GDP.info()
print(df_GDP)
plt.style.use('seaborn-whitegrid')
df_GDP.plot.line(x='Year', y=['Budget balance(in % of GDP)[105]', 'Unemployment(in %)'])
plt.show()