-
Notifications
You must be signed in to change notification settings - Fork 0
/
B02 avg_familiarity_by_combination.py
140 lines (105 loc) · 5.07 KB
/
B02 avg_familiarity_by_combination.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sqlalchemy as db
import time
import numpy as np
import sys
import pandas as pd
#import tables
def main(connect_string):
# connection to database
sql_engine = db.create_engine(connect_string)
# input of othere params: name of table where data are stored, file to write to
infile = input("Input file [fam_by_pat_class_comb.csv]?") or 'fam_by_pat_class_comb.csv'
tabinname = input("Companies table [patassg]?") or 'patassg'
outfile1 = input("Output file [avg_by_pat.csv]?") or 'avg_by_pat.csv'
outfile2 = input("Output file [avg_by_year.csv]?") or 'avg_by_year.csv'
tab_in = config['database'] + '.' + tabinname
print ('Prog start ', time.ctime())
# tables read
sql_in = 'Select distinct a.patent, c.pdpco, c.gvkey, p.appyear From '
sql_in += 'nber.patassg a Inner Join nber.dynass b On b.pdpass = a.pdpass Inner Join '
sql_in += 'nber.pdpcohdr c On c.pdpco = b.pdpco1 Inner Join nber.pat76_06_assg p On a.patent = p.patent'
# tables read in a dataframe df_fam
df_fam= pd.read_csv(infile) #,patent,ipc4comb,familiarity,cum_fam
# drop fam 0 values
df_fam=df_fam[df_fam['familiarity']>0]
# new dataframe with means calculations
# 1a average familiarity by class combination
df_avg_fam=df_fam.groupby('patent').familiarity.mean().reset_index()
df_avg_fam.rename( columns={"familiarity": "avg_familiarity"}, inplace=True)
# output to csv file
with open('combfam_'+outfile1, 'w') as f:
df_avg_fam.to_csv(f, header=True)
print('Step 1a completed ', time.ctime())
f.close
# 1b cumulate familiarity by class combination
df_avg_cumfam=df_fam.groupby('patent').cum_fam.mean().reset_index()
df_avg_cumfam.rename( columns={"cum_fam": "avg_cum_fam"}, inplace=True)
# output to csv file
with open('cumfam_'+outfile1, 'w') as f:
df_avg_cumfam.to_csv(f, header=True)
print('Step 1b completed ', time.ctime())
f.close
# step 2
# reads companies / patent : a.patent, c.pdpco, c.gvkey, p.appyear
df_pdpco = pd.read_sql_query(sql_in, sql_engine)
# 2b join company and patent data
df_pdpco_fam =pd.merge(df_pdpco, df_avg_fam, how='inner', on=['patent'])
# 2c average by pdpco
df_fam_pdpco_year = df_pdpco_fam.groupby(['pdpco','appyear']).avg_familiarity.mean().reset_index()
df_fam_pdpco_year['avg_familiarity_sqrt']= df_fam_pdpco_year['avg_familiarity'].apply(np.sqrt)
# output to csv file
with open('combfam_pdpco_'+outfile2, 'w') as f:
df_fam_pdpco_year.to_csv(f, header=True)
print('Step 2c completed ', time.ctime())
f.close
# 2d average by pdpco
df_fam_gvkey_year = df_pdpco_fam.groupby(['gvkey','appyear']).avg_familiarity.mean().reset_index()
df_fam_gvkey_year['avg_familiarity_sqrt'] = df_fam_gvkey_year['avg_familiarity'].apply(np.sqrt)
# output to csv file
with open('combfam_gvkey_'+outfile2, 'w') as f:
df_fam_gvkey_year.to_csv(f, header=True)
print('Step 2d completed ', time.ctime())
f.close
# 2e join company and patent data
df_pdpco_cum =pd.merge(df_pdpco, df_avg_cumfam, how='inner', on=['patent'])
# 2f average cumulate by pdpco
df_cum_pdpco_year = df_pdpco_cum.groupby(['pdpco', 'appyear']).avg_cum_fam.mean().reset_index()
df_cum_pdpco_year['avg_cum_fam_sqrt'] = df_cum_pdpco_year['avg_cum_fam'].apply(np.sqrt)
# output to csv file
with open('cumfam_pdpco_' + outfile2, 'w') as f:
df_cum_pdpco_year.to_csv(f, header=True)
print('Step 2f completed ', time.ctime())
f.close
# 2g average cumulate by pdpco
df_cum_gvkey_year = df_pdpco_cum.groupby(['gvkey', 'appyear']).avg_cum_fam.mean().reset_index()
df_cum_gvkey_year['avg_cum_fam_sqrt'] = df_cum_gvkey_year['avg_cum_fam'].apply(np.sqrt)
# output to csv file
with open('cumfam_gvkey_' + outfile2, 'w') as f:
df_cum_gvkey_year.to_csv(f, header=True)
print('Step 2g completed ', time.ctime())
f.close
# Standard boilerplate to call the main() function.
if __name__ == '__main__':
# config dictionary contains connection parameters
config = {
'host': 'localhost',
'port': '3306',
'database': 'test',
'user': 'root',
'password': ' ',
'charset': 'utf8',
'use_unicode': True,
'get_warnings': True,
}
# input of default password for root and library
print ('B02 Index calc v1.0 201905 Py36')
config['password'] = input("Mysql password [mysql]?") or 'mysql'
config['database'] = input("Mysql library [nber]?") or 'nber'
# connect param: 'mysql+pymysql://USER:PW@DBHOST/DB'
connect_string = 'mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.format(config['user'],config['password'],config['host'], config['port'] ,config['database'])
print (connect_string)
# call of main procedure
out= main(connect_string)
print ("end time", time.ctime())