-
Notifications
You must be signed in to change notification settings - Fork 0
/
fannie.go
149 lines (142 loc) · 5.28 KB
/
fannie.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
// Package fannie loads the standard, HARP and non-standard (excluded) loans made available by Fannie Mae
// into a ClickHouse database.
//
// The final result is a single table with nested arrays for time-varying fields.
// Key features of this package:
// - New fields created are:
// - vintage (e.g. 2010Q2)
// - standard - Y/N flag, Y=standard process loan
// - loan age based on first pay date
// - numeric dq field
// - property value at origination
// - harp - Y/N flag, Y=HARP loan.
// - file name from which the loan was loaded
// - QA results. There are three sets of fields:
// - The nested table qa that has two arrays:
// - field. The name of a field that has validation issues.
// - cntFail. The number of months for which this field failed qa. For static fields, this value will
// be 1.
// - allFail. An array of field names which failed for qa. For monthly fields, this means the field failed for all months.
// - A "DESCRIBE" of the output table provides info on each field.
//
// The command-line parameters are:
//
// -host ClickHouse IP address. Default: 127.0.0.1.
// -user ClickHouse user. Default: default
// -password ClickHouse password for user. Default: <empty>.
// -table ClickHouse table in which to insert the data.
// -mapTable ClickHouse table that maps pre-HARP loan ids to HARP ids. This table is both created and used by the package.
// -create if Y, then the table is created/reset. Default: Y.
// -dir directory with Fannie Mae text files.
// -tmp ClickHouse database to use for temporary tables.
// -concur # of concurrent processes to use in loading monthly files. Default: 1.
// -memory max memory usage by ClickHouse. Default: 40000000000.
// -groupby max_bytes_before_external_group_by ClickHouse parameter. Default: 20000000000.
//
// The non-standard loans have four additional fields. This package recognizes whether the file is standard or not.
// A combined table can be built by running the app twice pointing to the same -table.
// On the first run, set -create Y and set -create N for the second run.
//
// Note: for this package to run correctly, the standard loans should be loaded first, so that the table that
// maps HARP loans to their corresponding pre-HARP loan is loaded and available. This table isn't needed after all the
// files are loaded.
//
// See the example under package collapse for the structure of the table.
//
// The data is available at https://datadynamics.fanniemae.com/data-dynamics/#/reportMenu;category=HP.
package main
import (
"flag"
"fmt"
"github.com/ClickHouse/clickhouse-go/v2"
"github.com/invertedv/chutils"
"github.com/invertedv/fannie/collapse"
"github.com/invertedv/fannie/raw"
"log"
"os"
"sort"
"strings"
"time"
)
// main drives the load: it parses the command-line flags, builds the list of
// Fannie Mae .csv files in -dir, loads the HARP loan-id map if present, and
// then, for each file, (1) loads the raw file into a temporary ClickHouse
// table and (2) collapses it into the final -table.
func main() {
	host := flag.String("host", "127.0.0.1", "string")
	user := flag.String("user", "default", "string")
	password := flag.String("password", "", "string")
	srcDir := flag.String("dir", "", "string")
	create := flag.String("create", "Y", "string")
	table := flag.String("table", "", "string")
	mapTable := flag.String("mapTable", "", "string")
	tmp := flag.String("tmp", "", "string")
	nConcur := flag.Int("concur", 1, "int")
	maxMemory := flag.Int64("memory", 40000000000, "int64")
	maxGroupby := flag.Int64("groupby", 20000000000, "int64")
	flag.Parse()

	createTable := *create == "Y" || *create == "y"

	// fail cleanly on a missing -dir; the old byte-index check
	// ((*srcDir)[len(*srcDir)-1]) panicked on an empty string.
	if *srcDir == "" {
		log.Fatalln("-dir is required")
	}
	// ensure the directory ends in a slash so file names can be appended
	if !strings.HasSuffix(*srcDir, "/") {
		*srcDir += "/"
	}

	// connect to ClickHouse
	con, err := chutils.NewConnect(*host, *user, *password, clickhouse.Settings{
		"max_memory_usage":                   *maxMemory,
		"max_bytes_before_external_group_by": *maxGroupby,
	})
	if err != nil {
		log.Fatalln(err)
	}
	defer func() {
		if e := con.Close(); e != nil {
			log.Fatalln(e)
		}
	}()

	// holds the set of files to work through
	fileList := make([]string, 0)

	dir, err := os.ReadDir(*srcDir)
	if err != nil {
		// include the underlying error, not just the directory name
		log.Fatalln(fmt.Errorf("reading directory %s: %w", *srcDir, err))
	}

	gotMap := false // if true, this directory has the mapping of pre-HARP to HARP loans

	// build the file list; "Loan" files are the map, not loan data
	for _, f := range dir {
		if strings.Contains(f.Name(), ".csv") && !strings.Contains(f.Name(), "Loan") {
			fileList = append(fileList, f.Name())
		}
		if f.Name() == "Loan_Mapping.txt" {
			gotMap = true
		}
	}

	if len(fileList) == 0 {
		log.Fatalln(fmt.Errorf("%s", "directory has no .csv files"))
	}

	// load the pre-HARP -> HARP loan-id map first so GroupBy can use it
	if gotMap {
		if e := raw.LoadHarpMap(*srcDir+"Loan_Mapping.txt", *mapTable, con); e != nil {
			log.Fatalln(e)
		}
	}

	// process the files in a deterministic order
	sort.Strings(fileList)

	step1Time := 0.0
	step2Time := 0.0
	for ind, fileName := range fileList {
		fullFile := *srcDir + fileName
		tmpTable := *tmp + ".source"

		// step 1: load the raw file into the temporary table
		s := time.Now()
		if e := raw.LoadRaw(fullFile, tmpTable, true, *nConcur, con); e != nil {
			log.Fatalln(e)
		}
		step1 := time.Since(s).Minutes()

		// step 2: collapse the temp table into the output table.
		// BUG FIX: was hard-coded "tmp.source", ignoring the -tmp flag.
		s = time.Now()
		if e := collapse.GroupBy(tmpTable, *table, *mapTable, createTable, con); e != nil {
			log.Fatalln(e)
		}
		step2 := time.Since(s).Minutes()

		// only the first file creates/resets the output table
		createTable = false
		fmt.Printf("Done with %s. %d out of %d ,times: %0.2f, %0.2f minutes\n", fileName, ind+1, len(fileList), step1, step2)
		step1Time += step1
		step2Time += step2
	}
	step1Time /= 60.0
	step2Time /= 60.0
	fmt.Printf("step1 time: %0.2f step2 time: %0.2f hours, total: %0.2f\n", step1Time, step2Time, step1Time+step2Time)

	// clean up: drop the temp table; error deliberately ignored — we are done
	_, _ = con.Exec(fmt.Sprintf("DROP TABLE %s.source", *tmp))
}