#!/bin/bash
#
# usage:
# update-topic-model.sh ${1}
# where ${1} is the topic model's corpus shortname (e.g. frus)
#
# description:
# Updates a corpus-specific topic model in the FOIArchive database.
#
# assumptions:
# - the script is invoked from the directory where it is installed
# - the topic model csv files have been loaded into the data subdirectory
# - the Postgres tools (min: psql, pg_dump) are installed and in the PATH
# - the DBCONNECT environment variable is set to the db connect string
# - zip is installed locally
# - the aws cli has been installed and configured
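#
# example:
#   DBCONNECT="postgresql://user@host/db" ./update-topic-model.sh frus
#   (the connect string above is illustrative; substitute your own)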
# Validate arguments and environment variables
if [ -z "$1" ]; then
    echo "Error: No corpus shortname provided."
    echo "Usage: $0 <corpus_shortname>"
    exit 1
fi
if [ -z "${DBCONNECT}" ]; then
    echo "Error: DBCONNECT environment variable must be defined."
    exit 1
fi
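# Stop at the first failed command so an error in pg_dump or psql cannot
# cascade into later, destructive steps. Note that psql exits 0 on SQL
# errors unless ON_ERROR_STOP is set, so this is only a partial safeguard.
set -e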
CORPUS=${1}
DATA_DIR=./data
heading () {
    echo
    echo "======================================================================"
    echo "$1"
    echo "======================================================================"
}
load_stage () {
    table_name="foiarchive.${1}_stage"
    csv_file="${DATA_DIR}/${CORPUS}_update_${1}.csv"
    if [ ! -f "$csv_file" ]; then
        echo "Error: File ${csv_file} not found."
        exit 1
    fi
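    # \copy runs client-side: psql reads ${csv_file} from this machine
    # rather than from the database server's filesystem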
psql -X -e ${DBCONNECT} <<!EOF!
\copy ${table_name} from '${csv_file}' delimiter ',' csv header
!EOF!
}
# Main script body
# long-term retention of this backup is not needed
heading "Back up topics & topic_docs tables in case of error:"
pg_dump -t foiarchive.topics -t foiarchive.topic_docs -f ${DATA_DIR}/topic-tables.sql -v ${DBCONNECT}
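# If the update goes wrong, the dumped tables can be restored by hand,
# e.g. after dropping the damaged tables:
#   psql ${DBCONNECT} -f ${DATA_DIR}/topic-tables.sql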
#
heading "Drop and recreate the staging tables:"
psql -X -e ${DBCONNECT} -f create-stage-tables.sql
#
heading "Load CSVs into staging tables:"
load_stage topics
load_stage topic_doc
# Update the production tables from the stage tables
heading "Replace the current topic model from ${CORPUS} with the new one:"
psql -X -e ${DBCONNECT} -v corpus=${CORPUS} -f update-from-stage.sql
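# Optional, manual sanity check: confirm the production tables are
# populated after the swap, e.g.:
#   psql -X ${DBCONNECT} -c "select count(*) from foiarchive.topics;"
#   psql -X ${DBCONNECT} -c "select count(*) from foiarchive.topic_docs;"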
#
heading "Zipping the csv files from ./data"
DATE=`date +"%m-%d-%y"`
zip "${1}_stm_${DATE}.zip" data/*.csv
#
heading "Copying zip file to aws"
aws s3 cp "${1}_stm_${DATE}.zip" \
s3://history-lab-research-data/STM-2024/${1}_stm_${DATE}.zip
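# The upload can be verified manually, e.g.:
#   aws s3 ls s3://history-lab-research-data/STM-2024/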
#
heading "Removing csv files from ./data"
rm data/*.csv
#
heading "Script complete."