Skip to content

Commit 7e04a45

Browse files
authored
Largest language in output first (#81)
* output: largest language (english) first in CSV files
1 parent f798ce5 commit 7e04a45

File tree

1 file changed

+38
-4
lines changed

1 file changed

+38
-4
lines changed

steps/output.sh

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,16 +122,50 @@ pg_dump -d $DATABASE_NAME --no-owner -t wikipedia_article -t wikipedia_redirect
122122
pigz -9 > "$OUTPUT_PATH/wikipedia_importance.sql.gz"
123123

124124

125+
# Helper table for sorting the output by language, largest language first. Nominatim
126+
# assigns the wikipedia extra tag to the first language it finds during import, and
127+
# English (en) makes debugging easier than Arabic (ar).
128+
# It is a regular table rather than a TEMPORARY one because each psqlcmd call starts
129+
# a new session, and a temporary table would not survive into the next call.
130+
#
131+
# language | size
132+
# ----------+---------
133+
# en | 3360898
134+
# de | 989366
135+
# fr | 955523
136+
# uk | 920531
137+
# sv | 918185
138+
139+
echo "DROP TABLE IF EXISTS top_languages;" | psqlcmd
140+
echo "CREATE TABLE top_languages AS
141+
SELECT language, COUNT(*) AS size
142+
FROM wikimedia_importance
143+
GROUP BY language
144+
ORDER BY size DESC
145+
;" | psqlcmd
146+
147+
148+
149+
125150
for TABLE in wikipedia_article wikipedia_redirect wikimedia_importance
126151
do
127152
echo "* $TABLE.csv.gz"
128153

154+
SORTCOL="title"
155+
if [[ "$TABLE" == "wikipedia_redirect" ]]; then
156+
SORTCOL="from_title"
157+
fi
158+
129159
{
130160
echo "COPY (SELECT * FROM $TABLE LIMIT 0) TO STDOUT WITH DELIMITER E'\t' CSV HEADER" | \
131161
psqlcmd
132-
echo "COPY $TABLE TO STDOUT" | \
133-
psqlcmd | \
134-
sort
162+
echo "COPY (
163+
SELECT w.*
164+
FROM $TABLE w
165+
JOIN top_languages tl ON w.language = tl.language
166+
ORDER BY tl.size DESC, w.$SORTCOL
167+
) TO STDOUT" | \
168+
psqlcmd
135169
} | pigz -9 > "$OUTPUT_PATH/$TABLE.csv.gz"
136170

137171
# default is 600
@@ -143,4 +177,4 @@ du -h $OUTPUT_PATH/*
143177
# 220M wikipedia_article.csv.gz
144178
# 87M wikipedia_redirect.csv.gz
145179
# 305M wikipedia_importance.sql.gz
146-
# 87M wikimedia_importance.csv.gz
180+
# 265M wikimedia_importance.csv.gz

0 commit comments

Comments
 (0)