-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPerimatch_Wrangle.py
152 lines (140 loc) · 5.1 KB
/
Perimatch_Wrangle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
'''Perimatch wrangle

Wrangles complicated files in which a year of graduation precedes a group of
several names.
Names are of the format:
"Last, First Degree(s) (ProgramType)", "New Job, Institution, City, State"
'''
import pandas as pd
import numpy as np
#from names import split_name
def addyear(frame, years=(2000, 2015), col="Col1"):
    '''Add a 'GRAD_YEAR' column by filling year-header rows downwards.

    Walks down ``frame[col]``. A cell that converts to an integer inside
    the inclusive range spanned by *years* is treated as a year header:
    it becomes the current year and its own row gets ``None``. Every
    other row gets the most recent year seen so far (``0`` until the
    first header is met).

    *frame* - DataFrame to annotate; modified in place and returned.
    *years* - bounds (lowest, highest) of what counts as a year.
    *col*   - name of the column to scan.
    '''
    first_year, last_year = min(years), max(years)
    grad_years = []
    current = 0
    for cell in frame[col]:
        try:
            candidate = int(cell)
        except (TypeError, ValueError, OverflowError):
            # Names, blanks (None/NaN) and other non-numeric cells are
            # ordinary data rows, not year headers.
            candidate = None
        if candidate is not None and first_year <= candidate <= last_year:
            current = candidate
            grad_years.append(None)
        else:
            # Out-of-range integers (e.g. ID numbers) are data rows too.
            grad_years.append(current)
    frame['GRAD_YEAR'] = grad_years
    return frame
def addphd(frame, col="Col1", PhD_col='PhD'):
    '''Flag each row of *col* according to whether it mentions a PhD.

    Each cell is converted to upper-case text and searched for the
    substrings 'PHD' and 'MD':

    * exactly one 'PHD'               -> True
    * no 'PHD' and exactly one 'MD'   -> False
    * anything else                   -> None (cannot tell)

    *frame*   - DataFrame to annotate; modified in place and returned.
    *col*     - name of the column to search.
    *PhD_col* - name of the column that receives the flags.
    '''
    flags = []
    for cell in frame[col]:
        text = str(cell).upper()
        phd_hits = text.count('PHD')
        md_hits = text.count('MD')
        if phd_hits == 1:
            flags.append(True)
        elif phd_hits == 0 and md_hits == 1:
            flags.append(False)
        else:
            flags.append(None)
    # Honour the caller-supplied column name instead of a fixed 'PhD'.
    frame[PhD_col] = flags
    return frame
def dropMD(frame, col='Col1'):
    '''Strip the degree suffix from every name in *col*.

    Each cell is replaced by whatever text comes before the first
    ', MD'; a cell without ', MD' is kept whole (as text). Missing cells
    (None / NaN) are stored as ``None`` rather than being turned into
    the strings 'None' or 'nan'.

    *frame* - DataFrame to clean; modified in place and returned.
    *col*   - name of the column holding the names.
    '''
    names = []
    for cell in frame[col]:
        # NaN is the only float that is not equal to itself.
        is_missing = cell is None or (isinstance(cell, float) and cell != cell)
        if is_missing:
            names.append(None)
        else:
            names.append(str(cell).partition(', MD')[0])
    frame[col] = names
    return frame
def flip_name(name):
    '''Turn 'Firstname [Middle ...] Lastname' into 'Lastname, Firstname [Middle ...]'.

    The string is split on whitespace; the last word is taken as the
    surname and the remaining words follow it after a comma, separated
    by single spaces. A single-word name comes back as 'Word,'. An empty
    or whitespace-only string comes back as '' instead of raising.
    '''
    parts = name.split()
    if not parts:
        # Nothing to flip; avoids an IndexError on parts[-1].
        return ''
    *given, surname = parts
    if not given:
        return surname + ','
    return surname + ', ' + ' '.join(given)
def flip_keep(frame, col='Col1'):
    '''Write a 'Name' column holding every name of *col* in 'Last, First' order.

    Each cell is converted to text. Text that already contains at least
    one comma is kept as it is; text without a comma is passed through
    flip_name(). The results always go into the column called 'Name'.

    *frame* - DataFrame to process; modified in place and returned.
    *col*   - name of the column to read the names from.
    '''
    def _last_first(cell):
        text = str(cell)
        # Only comma-free names need reordering.
        return flip_name(text) if ',' not in text else text

    frame['Name'] = [_last_first(cell) for cell in frame[col]]
    return frame
def YNTF(frame, col="Col1", newcol=''):
    '''Convert a yes/no column into True/False/None flags.

    Each cell of *col* is upper-cased as text and classified:

    * exactly one 'Y'              -> True
    * otherwise, exactly one 'N'   -> False
    * anything else                -> None

    NOTE: this is a letter count, not a word match, so any text with a
    single 'Y' counts as a yes.

    *frame*  - DataFrame to process; modified in place and returned.
    *col*    - name of the column to read.
    *newcol* - name of the column to write; when left as '' the flags
               overwrite *col* itself.
    '''
    def _as_flag(cell):
        text = str(cell).upper()
        if text.count('Y') == 1:
            return True
        if text.count('N') == 1:
            return False
        return None

    flags = [_as_flag(cell) for cell in frame[col]]
    target = newcol if newcol != '' else col
    frame[target] = flags
    return frame
def fix_specialty(frame, dictionary, col="Specialty", newcol=''):
    '''Normalise variably written specialties to a uniform set of names.

    Each cell of *col* is reduced to a lookup key - its first four
    characters in title case (e.g. 'cardiology' -> 'Card') - and the key
    is looked up in *dictionary*. Keys that are not in the dictionary,
    and missing cells, give a missing value.

    *frame*      - DataFrame to process; modified in place and returned.
    *dictionary* - mapping from four-letter key to the uniform name.
    *col*        - name of the column holding the raw specialties.
    *newcol*     - name of the column to write; when left as '' the
                   result overwrites *col* itself.
    '''
    # astype(str) keeps missing / non-string cells from breaking the slice;
    # building the keys as a Series avoids a temporary helper column.
    keys = frame[col].astype(str).str.slice(0, 4).str.title()
    target = col if newcol == '' else newcol
    frame[target] = keys.map(dictionary)
    return frame
# Input and output files. Raw strings keep the Windows backslashes literal;
# without the r-prefix Python 3 rejects '\U' as a malformed unicode escape.
Residents_path = r'C:\Users\JAG\RePORTER\Residencies\perimatch.csv'
Fixed_path = r'C:\Users\JAG\RePORTER\Residencies\perimatch-fixb.csv'

# Lookup used by fix_specialty(): first four letters of the specialty as
# written (title case) -> uniform specialty name.
specialties = {
    'Alle': 'Allergy and Immunology',
    'Card': 'Cardiology',
    'Chie': 'Chief Resident',
    'Endo': 'Endocrinology',
    'Gast': 'Gastroenterology',
    'Gene': 'General Medicine',
    'Geri': 'Geriatrician',
    'Grou': 'Private Practice',
    'Hema': 'Hematology and Oncology',
    'Hosp': 'Hospital Medicine',
    'Infe': 'Infectious Disease',
    'Neph': 'Nephrology',
    'Onco': 'Hematology and Oncology',
    'Priv': 'Private Practice',
    'Pulm': 'Pulmonary and Critical Care',
    'Rheu': 'Rheumatology',
}

Residents = pd.read_csv(Residents_path, index_col=False, header=0)

# Drop the columns that are not wanted in the output. 'axis' is passed by
# keyword because pandas 2.0 no longer accepts it positionally.
Residents = Residents.drop('Unnamed: 0', axis=1)
Residents = Residents.drop('ID number', axis=1)

# Clean-up pipeline: flag PhDs, strip degree suffixes, put names in
# 'Last, First' order, convert the yes/no column, normalise specialties.
Residents = addphd(Residents, col='Name')
Residents = dropMD(Residents, col='Name')
Residents = flip_keep(Residents, col='Name')
Residents = YNTF(Residents, col='Traditional fellowship')
Residents = fix_specialty(Residents, specialties, newcol='Fix_Spec')

Residents.to_csv(Fixed_path)