-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathduplicate_finder.js
113 lines (81 loc) · 2.46 KB
/
duplicate_finder.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
const http = require('http');
const _ = require('lodash');
const Knex = require('knex');
const api = require('./api');
// Knex configuration for the SQLite database file this script cleans up.
const knexConfig = {
  client: 'sqlite3',
  connection: {
    filename: "./irish-writers.sqlite"
  },
  useNullAsDefault: true,
  // The pool settings have to live under the top-level `pool` key: knex does
  // not read a `databaseConnection` key, so a hook nested there never runs.
  pool: {
    // SQLite leaves foreign-key enforcement off for each new connection, so
    // switch it on as soon as the pool opens one.
    afterCreate: (conn, cb) => {
      conn.run('PRAGMA foreign_keys = ON', cb);
    }
  }
};
// Shared knex instance built from knexConfig; the functions below pass it to
// the api helpers and use it directly for their own queries.
const knex = Knex(knexConfig);
/**
 * Finds people whose core fields are identical apart from `DB_id` and merges
 * each duplicate into the first matching record.
 *
 * People are visited in the order returned by `api.getAllPeople`. The first
 * record seen with a given set of core fields is kept; for every later match
 * the `col_Authors` rows are re-pointed at the kept record and the duplicate
 * `people` row is deleted. A summary line is logged at the end.
 *
 * @returns {Promise<void>} Resolves once every listed person has been checked.
 * @throws Rejects with whatever error the api helpers or knex raise; the
 *   merge that was in progress is not committed in that case.
 */
const findDuplicatePeople = async () => {
  const allPeople = (await api.getAllPeople(knex, {})).data;
  // JSON form of a person's core fields (without DB_id) -> id of the kept row.
  const alreadyFound = new Map();

  for (const { DB_id: id } of allPeople) {
    const personData = await api.getSinglePerson(knex, id);
    const noId = _.omit(personData.core, 'DB_id');
    const stringified = JSON.stringify(noId);

    if (!alreadyFound.has(stringified)) {
      alreadyFound.set(stringified, id);
      continue;
    }

    const keptId = alreadyFound.get(stringified);
    if (keptId === id) {
      // Defensive: if the listing repeats an id, the record would otherwise be
      // treated as a duplicate of itself and deleted outright.
      continue;
    }

    // Re-point and delete as one unit so a failure part-way through cannot
    // leave author links moved while the duplicate row still exists.
    await knex.transaction(async (trx) => {
      await trx('col_Authors')
        .where('person_id', '=', id)
        .update({
          'person_id': keptId
        });
      // After a successful update nothing should still match; kept as a
      // safety net so no link to the removed person can survive.
      await trx('col_Authors')
        .where('person_id', '=', id)
        .del();
      await trx('people')
        .where('DB_id', '=', id)
        .del();
    });
    console.log('found duplicate', noId['lastname_keyname'], id, keptId);
  }

  console.log("People: ", allPeople.length, alreadyFound.size);
};
/**
 * Finds publications whose core fields are identical apart from `DB_id` and
 * merges each duplicate into the first matching record.
 *
 * Publications are visited in the order returned by `api.getAllPublications`.
 * The first record seen with a given set of core fields is kept; for every
 * later match the `col_Authors` rows are re-pointed at the kept record and the
 * duplicate `pub_Publications` row is deleted. A summary line is logged at the
 * end.
 *
 * @returns {Promise<void>} Resolves once every listed publication has been
 *   checked.
 * @throws Rejects with whatever error the api helpers or knex raise; the
 *   merge that was in progress is not committed in that case.
 */
const findDuplicatePublications = async () => {
  const allPublications = (await api.getAllPublications(knex, {})).data;
  // JSON form of a publication's core fields (without DB_id) -> id of the kept row.
  const alreadyFound = new Map();

  for (const { DB_id: id } of allPublications) {
    const publicationData = await api.getSinglePublication(knex, id);
    const noId = _.omit(publicationData.core, 'DB_id');
    const stringified = JSON.stringify(noId);

    if (!alreadyFound.has(stringified)) {
      alreadyFound.set(stringified, id);
      continue;
    }

    const keptId = alreadyFound.get(stringified);
    if (keptId === id) {
      // Defensive: if the listing repeats an id, the record would otherwise be
      // treated as a duplicate of itself and deleted outright.
      continue;
    }

    // Re-point and delete as one unit so a failure part-way through cannot
    // leave author links moved while the duplicate row still exists.
    await knex.transaction(async (trx) => {
      await trx('col_Authors')
        .where('publication_id', '=', id)
        .update({
          'publication_id': keptId
        });
      // After a successful update nothing should still match; kept as a
      // safety net so no link to the removed publication can survive.
      await trx('col_Authors')
        .where('publication_id', '=', id)
        .del();
      await trx('pub_Publications')
        .where('DB_id', '=', id)
        .del();
    });
    console.log('found duplicate', noId['title'], id, keptId);
  }

  console.log("Publications: ", allPublications.length, alreadyFound.size);
};
// Script entry point: de-duplicate people first, then publications, and exit
// explicitly when both passes are done.
findDuplicatePeople()
  .then(findDuplicatePublications)
  .then(() => process.exit(0))
  .catch((err) => {
    // Without this handler a failed pass is an unhandled rejection; report it
    // and signal failure to the caller through the exit code.
    console.error('Duplicate cleanup failed:', err);
    process.exit(1);
  });