forked from OPEN-NEXT/LOSH-RDF-DB-tester
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fill-db
executable file
·152 lines (133 loc) · 3.35 KB
/
fill-db
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env bash
# SPDX-FileCopyrightText: 2021 - 2023 Robin Vobruba <[email protected]>
#
# SPDX-License-Identifier: Unlicense
# See the output of "$0 -h" for details.
# Strict mode: abort on a failing command (errexit), on expansion of an
# unset variable (nounset) and when any stage of a pipeline fails
# (pipefail); errtrace makes functions and subshells inherit ERR traps.
# see: https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
set -o errtrace -o errexit -o nounset -o pipefail
#set -o errtrace -o errexit -o nounset

# Directory holding this script, with symlinks resolved.
script_dir="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
# Pull in the shared helpers and settings that live next to this script.
# shellcheck source=./_common
. "$script_dir/_common"

# initial default values
# online:  switched to false by -o/--offline; guards all git network access.
# cleanup: gates the file-name check and the Krawler data fix-up below;
#          no command-line option changes it.
online=true
cleanup=false
CLONE_URL_LOSH_DATA="https://gitlab.opensourceecology.de/verein/projekte/losh-rdf.git"
CLONE_URL_LOSH_ONTOLOGY="https://github.com/OPEN-NEXT/OKH-LOSH.git"
APP_NAME="RDF DB Filler"
#######################################
# Prints the usage help of this script.
# Globals:   APP_NAME (read)
# Arguments: none
# Outputs:   the help text on stdout
#######################################
function print_help() {
  # Kept local so that showing the help does not leak a global variable.
  local script_name
  script_name="$(basename "$0")"

  echo "$APP_NAME - Fills an Apache Jena RDF DB with data from RDF/Turtle files."
  echo
  echo "Usage:"
  echo " $script_name [OPTION...]"
  echo "Options:"
  echo " -h, --help Print this usage help and exit"
  # The argument parser accepts the short form too, so it is documented here.
  echo " -o, --offline Do not try to fetch git repos"
  echo "Examples:"
  echo " $script_name"
}
# read command-line args
# Known options are consumed here; everything else is collected in
# POSITIONAL and put back as the positional parameters afterwards.
POSITIONAL=()
while [[ $# -gt 0 ]]; do
  arg="$1"
  shift # $2 -> $1, $3 -> $2, ...

  case "$arg" in
    -h|--help)
      print_help
      exit 0
      ;;
    -o|--offline)
      online=false
      ;;
    *) # non-/unknown option
      POSITIONAL+=("$arg") # save it in an array for later
      ;;
  esac
done
# restore positional parameters
# The ${arr[@]+...} guard is needed because bash < 4.4 treats the expansion
# of an empty array as an unset variable, which aborts under 'set -u'
# whenever the script is called without positional arguments.
set -- ${POSITIONAL[@]+"${POSITIONAL[@]}"}
#######################################
# Makes sure a local git clone exists and - when online - is up to date.
# An existing clone is fetched and rebased onto origin/<branch>;
# a missing one is cloned, unless we are offline, which is fatal.
# Globals:   online (read) - "true" or "false"
# Arguments: $1 - directory of the local clone
#            $2 - URL to clone from if the directory does not exist
#            $3 - branch to rebase onto (optional, default: master)
# Outputs:   an error message on stderr if offline and the clone is missing
# Returns:   status of the last git command; exits the script with 1
#            if offline and the clone is missing
#######################################
function repo_ensure_latest() {
  # 'local' keeps these from overwriting same-named variables of the caller.
  local dir="$1"
  local clone_url="$2"
  local main_branch="${3:-master}"

  if [[ -d "$dir" ]]; then
    # Already cloned: only touch the network when allowed to.
    if [[ "$online" == "true" ]]; then
      git -C "$dir" fetch
      git -C "$dir" rebase "origin/$main_branch"
    fi
  else
    if [[ "$online" != "true" ]]; then
      >&2 echo "ERROR: --offline chosen and repo '$dir' not present!"
      exit 1
    fi
    git clone "$clone_url" "$dir"
  fi
}
# Prerequisites: a usable Java and a local Apache Jena installation.
# NOTE(review): both helpers and the JENA_* variables are not defined in
# this file - presumably they come from the sourced _common; confirm.
ensure_java_version
ensure_tool "$JENA_HOME" "$JENA_DL_URL" "Apache Jena"

printf '\n%s\n' "Get/Update input RDF ..."
# Both repositories are kept below ./data, relative to the current directory,
# each in a directory named after its clone URL without the ".git" suffix.
mkdir -p "data"

# The ontology; its whole work-tree is searched for Turtle files later on.
ont_dir_name="$(basename "$CLONE_URL_LOSH_ONTOLOGY" ".git")"
ont_dir="data/$ont_dir_name"
repo_ensure_latest "$ont_dir" "$CLONE_URL_LOSH_ONTOLOGY"
ontology_ttls_root="$ont_dir"

# The data; its main branch is "main" and only its RDF/ sub-directory is used.
data_dir_name="$(basename "$CLONE_URL_LOSH_DATA" ".git")"
data_dir="data/$data_dir_name"
repo_ensure_latest "$data_dir" "$CLONE_URL_LOSH_DATA" "main"
data_ttls_root="$data_dir/RDF"
# Optional pre-processing of the data Turtle files; runs only if the
# 'cleanup' flag is "true".
# Both loops read NUL-delimited paths, so unusual file names (including ones
# containing newlines) reach the loop body intact.
# The loops run in a pipeline subshell; 'exit 2' therefore ends the pipeline
# with status 2, which terminates the script only where errexit is active.
if [[ "$cleanup" == "true" ]]; then
  echo
  echo "Checking file-names ..."
  find "$data_ttls_root" -type f -name "*.ttl" -print0 \
    | while IFS= read -r -d '' file; do
      # Any white-space character anywhere in the path is rejected.
      if [[ "$file" =~ [[:space:]] ]]; then
        >&2 echo "ERROR: File-name with spaces detected, please rename: '$file'"
        exit 2
      fi
    done

  echo
  echo "Fixing the data from the Krawler ..."
  find "$data_ttls_root" -type f -name "*.ttl" -print0 \
    | while IFS= read -r -d '' ttl_file; do
      echo " '$ttl_file' ..."
      # Rewrites a '@base <...#> .' line: drops the trailing '#' from the
      # base IRI and adds an '@prefix : <#> .' line right after it.
      # The final dot is escaped, so only the literal Turtle statement
      # terminator matches. '\n' in the replacement needs GNU sed.
      sed -i \
        -e 's|^@base\(.*\)#> \.$|@base\1> .\n@prefix : <#> .|' \
        "$ttl_file"
    done
fi
# Load every Turtle file of the ontology and of the data into the DB.
# NOTE(review): db_dir and jena_db_data_injector are not defined in this
# file - presumably they come from the sourced _common; confirm.
printf '\n%s\n' "Setting up the DB in '$db_dir' ..."
mkdir -p "$db_dir"

# Fixed arguments of the loader; xargs appends the file paths after them.
loader_args=(--loc "$db_dir" --syntax turtle)
# NUL-delimited hand-over, so any file name reaches the loader intact.
find "$ontology_ttls_root" "$data_ttls_root" -name "*.ttl" -print0 \
  | xargs -0 "$jena_db_data_injector" "${loader_args[@]}"
echo "done. (Setting up the DB)"
# Tell the user how to query the freshly filled DB.
# The here-doc is unquoted on purpose: the variables are expanded, and each
# '\\' yields the single backslash that marks a line continuation in the
# printed sample command.
cat <<EOF

Ready to run SPARQL queries!

for example:
$jena_db_data_querier \\
 --loc "$db_dir" \\
 --query "sample-query.txt"

Or start the Web interface for running SPARQL queries with:
./web-ui
EOF