@@ -122,16 +122,50 @@ pg_dump -d $DATABASE_NAME --no-owner -t wikipedia_article -t wikipedia_redirect
122
122
pigz -9 > " $OUTPUT_PATH /wikipedia_importance.sql.gz"
123
123
124
124
125
# Helper table for sorting the output by most popular language. Nominatim assigns
# the wikipedia extra tag to the first language it finds during import and English (en)
# makes debugging easier than Arabic (ar).
#
# This is a regular table rather than a TEMPORARY one: every psqlcmd call starts
# a new database session, and a temporary table would not survive into the
# sessions that read it later.
#
# Example content:
#
#  language |  size
# ----------+---------
#  en       | 3360898
#  de       |  989366
#  fr       |  955523
#  uk       |  920531
#  sv       |  918185

# Drop any leftover from a previous run so the script can be re-run.
echo "DROP TABLE IF EXISTS top_languages;" | psqlcmd

# One row per language with its number of rows in wikimedia_importance.
echo "CREATE TABLE top_languages AS
      SELECT language, COUNT(*) AS size
      FROM wikimedia_importance
      GROUP BY language
      ORDER BY size DESC
      ;" | psqlcmd
146
+
147
+
148
+
149
+
125
150
for TABLE in wikipedia_article wikipedia_redirect wikimedia_importance
126
151
do
127
152
# Report which export file is being written for the current $TABLE.
echo "* $TABLE.csv.gz"

# Secondary sort column inside each language. wikipedia_redirect is sorted by
# from_title, every other table by title.
SORTCOL="title"
if [[ "$TABLE" == "wikipedia_redirect" ]]; then
    SORTCOL="from_title"
fi

# Both COPY outputs are concatenated into a single compressed file:
#   1. a header line only (LIMIT 0 selects no rows, CSV HEADER prints the
#      column names, tab-delimited),
#   2. the rows themselves, ordered by language popularity (largest first).
#      tl.language breaks ties between languages of equal size so that their
#      rows are not interleaved; $SORTCOL orders rows inside one language.
# NOTE(review): the inner JOIN omits rows whose language has no entry in
# top_languages -- confirm that every table only uses languages present there.
{
    echo "COPY (SELECT * FROM $TABLE LIMIT 0) TO STDOUT WITH DELIMITER E'\t' CSV HEADER" | \
        psqlcmd
    echo "COPY (
            SELECT w.*
            FROM $TABLE w
            JOIN top_languages tl ON w.language = tl.language
            ORDER BY tl.size DESC, tl.language, w.$SORTCOL
          ) TO STDOUT" | \
        psqlcmd
} | pigz -9 > "$OUTPUT_PATH/$TABLE.csv.gz"
136
170
137
171
# default is 600
@@ -143,4 +177,4 @@ du -h $OUTPUT_PATH/*
143
177
# 220M wikipedia_article.csv.gz
144
178
# 87M wikipedia_redirect.csv.gz
145
179
# 305M wikipedia_importance.sql.gz
146
- # 87M wikimedia_importance.csv.gz
180
+ # 265M wikimedia_importance.csv.gz
0 commit comments