
Commit e13b39c

Add linktarget table (#84)
* Also download and process new linktarget table
1 parent d758ddf commit e13b39c

11 files changed (+100 -42 lines)

README.md

Lines changed: 3 additions & 3 deletions
@@ -21,9 +21,9 @@ in the results match the search terms).

Wikipedia publishes [dumps](https://meta.wikimedia.org/wiki/Data_dumps) of their databases once per month.

-To run one build you need 150GB of disc space (of which 90GB is Postgresql database). The scripts process
-39 languages and output one file. Runtime is approximately 9 hours on a 4 core, 4GB RAM machine with SSD
-discs.
+To run one build you need 150GB of disc space (of which 90GB Postgresql database). The scripts process
+39 languages and output 4 files. Runtime is approximately 9 hours on a 4 core, 4GB RAM machine with NVMe
+drives.

```
334M wikimedia_importance.tsv.gz

bin/filter_page.py

Lines changed: 0 additions & 1 deletion
@@ -6,7 +6,6 @@
# `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
# `page_namespace` int(11) NOT NULL DEFAULT 0,
# `page_title` varbinary(255) NOT NULL DEFAULT '',
-# `page_restrictions` tinyblob DEFAULT NULL,
# `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
# `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0,
# `page_random` double unsigned NOT NULL DEFAULT 0,

bin/filter_pagelinks.py

Lines changed: 18 additions & 6 deletions
@@ -3,16 +3,28 @@
'''
Input from STDIN
# CREATE TABLE `pagelinks` (
-# `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
-# `pl_namespace` int(11) NOT NULL DEFAULT 0,
-# `pl_title` varbinary(255) NOT NULL DEFAULT '',
-# `pl_from_namespace` int(11) NOT NULL DEFAULT 0,
+# `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
+# `pl_namespace` int(11) NOT NULL DEFAULT 0,
+# `pl_target_id` bigint(20) unsigned NOT NULL,

Output to STDOUT: pl_title, count
'''

import sys
import csv
+import gzip
+
+if len(sys.argv) < 2:
+    print("Usage: filter_pagelinks.py linktarget.csv.gz")
+    exit(1)
+
+linktarget_filename = sys.argv[1]
+linktarget_id_to_title = dict()
+
+with gzip.open(linktarget_filename, 'rt') as gzfile:
+    reader = csv.reader(gzfile)
+    for row in reader:
+        linktarget_id_to_title[row[0]] = row[1]

reader = csv.reader(sys.stdin)
writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL)
@@ -23,8 +35,8 @@
    if (row[1] != '0'):
        continue

-    title = row[2].replace('\r', '')
-    if len(title) == 0:
+    title = linktarget_id_to_title.get(row[2])
+    if title is None:
        continue

    if title not in counts:
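
With this change the pagelinks rows no longer carry a title; the numeric `pl_target_id` is resolved through the linktarget CSV built earlier in the pipeline. A minimal sketch of the lookup-and-count logic, using values modelled on the test fixtures further down (a `collections.Counter` stands in for the script's plain dict; this is an illustration, not the script itself):

```python
from collections import Counter

# lt_id -> lt_title, as filter_pagelinks.py loads it from linktarget.csv.gz
linktarget_id_to_title = {'11': 'title1', '22': 'title2',
                          '33': 'title3,with,comma', '44': 'title4'}

# pagelinks rows after mysqldump_to_csv.py: row[1] is the namespace,
# row[2] the pl_target_id; the first column is not used for counting here
pagelinks_rows = [
    ('enwiki', '0', '11'), ('enwiki', '0', '11'), ('enwiki', '0', '22'),
    ('enwiki', '1', '44'),   # non-article namespace, skipped
    ('enwiki', '0', '55'),   # no matching linktarget entry, skipped
]

counts = Counter()
for row in pagelinks_rows:
    if row[1] != '0':
        continue
    title = linktarget_id_to_title.get(row[2])
    if title is None:
        continue
    counts[title] += 1

print(counts)   # Counter({'title1': 2, 'title2': 1})
```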

bin/filter_redirect.py

Lines changed: 6 additions & 0 deletions
@@ -10,6 +10,12 @@
# `rd_fragment` varbinary(255) DEFAULT NULL,

Output to STDOUT: rd_from_page_id, rd_title
+
+Same for linktarget table
+# CREATE TABLE `linktarget` (
+# `lt_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
+# `lt_namespace` int(11) NOT NULL,
+# `lt_title` varbinary(255) NOT NULL,
'''

import sys
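
As the wikipedia_sql2csv.sh change below shows, the same filter_redirect.py is now also fed the linktarget dump: both tables start with an id, a namespace and a title, so one positional filter (keep namespace 0, emit id and title) covers either input. A rough sketch of that shared idea, assuming CSV rows arrive on stdin in CREATE TABLE column order and that the writer uses the same unix/QUOTE_MINIMAL style as filter_pagelinks.py; the real script may apply further checks.

```python
import csv
import sys

# rows: rd_from,rd_namespace,rd_title,...  or  lt_id,lt_namespace,lt_title
writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL)
for row in csv.reader(sys.stdin):
    if row[1] != '0':                    # keep the article namespace only
        continue
    writer.writerow([row[0], row[2]])    # (rd_from_page_id, rd_title) or (lt_id, lt_title)
```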

steps/latest_available_data.sh

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@ check_all_files_ready() {
## usually the last to be dumped
##
# from wikipedia_download.sh
-WIKIPEDIA_REQUIRED_FILES="page pagelinks langlinks redirect"
+WIKIPEDIA_REQUIRED_FILES="page pagelinks langlinks linktarget redirect"
DUMP_RUN_INFO_URL="https://mirror.clarkson.edu/wikimedia/zhwiki/$CHECK_DATE/dumpruninfo.json"
debug $DUMP_RUN_INFO_URL
DUMP_RUN_INFO=$(curl -s --fail "$DUMP_RUN_INFO_URL")

steps/wikipedia_download.sh

Lines changed: 11 additions & 9 deletions
@@ -42,19 +42,21 @@ do
mkdir -p "$DOWNLOADED_PATH/$LANG"

# English is the largest
-# 1.7G downloaded/en/page.sql.gz
-# 6.2G downloaded/en/pagelinks.sql.gz
-# 355M downloaded/en/langlinks.sql.gz
-# 128M downloaded/en/redirect.sql.gz
+# 2.1G downloaded/en/page.sql.gz
+# 6.4G downloaded/en/pagelinks.sql.gz
+# 492M downloaded/en/langlinks.sql.gz
+# 992M downloaded/en/linktarget.sql.gz
+# 160M downloaded/en/redirect.sql.gz

# Smaller language Turkish
-# 53M downloaded/tr/page.sql.gz
-# 176M downloaded/tr/pagelinks.sql.gz
-# 106M downloaded/tr/langlinks.sql.gz
-# 3.2M downloaded/tr/redirect.sql.gz
+# 90M downloaded/tr/page.sql.gz
+# 255M downloaded/tr/pagelinks.sql.gz
+# 166M downloaded/tr/langlinks.sql.gz
+# 62M downloaded/tr/linktarget.sql.gz
+# 4.2M downloaded/tr/redirect.sql.gz


-for FN in page.sql.gz pagelinks.sql.gz langlinks.sql.gz redirect.sql.gz; do
+for FN in page.sql.gz pagelinks.sql.gz langlinks.sql.gz linktarget.sql.gz redirect.sql.gz; do

  download https://$WIKIMEDIA_HOST/${LANG}wiki/$WIKIPEDIA_DATE/${LANG}wiki-$WIKIPEDIA_DATE-$FN "$DOWNLOADED_PATH/$LANG/$FN"
  download https://$WIKIMEDIA_HOST/${LANG}wiki/$WIKIPEDIA_DATE/md5sums-${LANG}wiki-$WIKIPEDIA_DATE-$FN.txt "$DOWNLOADED_PATH/$LANG/$FN.md5"

steps/wikipedia_sql2csv.sh

Lines changed: 37 additions & 8 deletions
@@ -17,11 +17,12 @@ do
mkdir -p "$CONVERTED_PATH/$LANG/"

echo "[language $LANG] Page table SQL => CSV"
+# https://www.mediawiki.org/wiki/Manual:Page_table
+#
# CREATE TABLE `page` (
# `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
# `page_namespace` int(11) NOT NULL DEFAULT 0,
# `page_title` varbinary(255) NOT NULL DEFAULT '',
-# `page_restrictions` tinyblob DEFAULT NULL,
# `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
# `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0,
# `page_random` double unsigned NOT NULL DEFAULT 0,
@@ -44,26 +45,50 @@ do
pigz -9 > $CONVERTED_PATH/$LANG/pages.csv.gz


+echo "[language $LANG] linktarget table SQL => CSV"
+# https://www.mediawiki.org/wiki/Manual:Linktarget_table
+#
+# CREATE TABLE `linktarget` (
+# `lt_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
+# `lt_namespace` int(11) NOT NULL,
+# `lt_title` varbinary(255) NOT NULL,
+#
+# Only interested in lt_namespace == 0 (articles)
+# English wikipedia:
+# input 964MB compressed (100m rows)
+# output 322MB compressed (30m rows)
+# Output columns: lt_id, lt_title
+
+unpigz -c $DOWNLOADED_PATH/${LANG}/linktarget.sql.gz | \
+  bin/mysqldump_to_csv.py | \
+  bin/filter_redirect.py | \
+  pigz -9 > $CONVERTED_PATH/$LANG/linktarget.csv.gz
+
+
+
echo "[language $LANG] Pagelinks table SQL => CSV"
+# https://www.mediawiki.org/wiki/Manual:Pagelinks_table
+#
# CREATE TABLE `pagelinks` (
-# `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
-# `pl_namespace` int(11) NOT NULL DEFAULT 0,
-# `pl_title` varbinary(255) NOT NULL DEFAULT '',
-# `pl_from_namespace` int(11) NOT NULL DEFAULT 0,
+# `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
+# `pl_namespace` int(11) NOT NULL DEFAULT 0,
+# `pl_target_id` bigint(20) unsigned NOT NULL,
#
-# Only interested in pl_namespace == 0 (articles)
+# Only interested in target_ids that point to == 0 (articles)
# English wikipedia:
# input 6.8GB compressed
# output 200MB compressed
-# Output columns: pl_title, count
+# Output columns: lt_title (from linktarget file), count (unique pl_from)

unpigz -c $DOWNLOADED_PATH/$LANG/pagelinks.sql.gz | \
  bin/mysqldump_to_csv.py | \
-  bin/filter_pagelinks.py | \
+  bin/filter_pagelinks.py $CONVERTED_PATH/$LANG/linktarget.csv.gz | \
  pigz -9 > $CONVERTED_PATH/$LANG/pagelinks.csv.gz


echo "[language $LANG] langlinks table SQL => CSV"
+# https://www.mediawiki.org/wiki/Manual:Langlinks_table
+#
# CREATE TABLE `langlinks` (
# `ll_from` int(8) unsigned NOT NULL DEFAULT 0,
# `ll_lang` varbinary(35) NOT NULL DEFAULT '',
@@ -81,7 +106,11 @@
pigz -9 > $CONVERTED_PATH/$LANG/langlinks.csv.gz


+
+
echo "[language $LANG] redirect table SQL => CSV"
+# https://www.mediawiki.org/wiki/Manual:Redirect_table
+#
# CREATE TABLE `redirect` (
# `rd_from` int(8) unsigned NOT NULL DEFAULT 0,
# `rd_namespace` int(11) NOT NULL DEFAULT 0,

tests/filter_pagelinks.test1.txt

Lines changed: 11 additions & 9 deletions
@@ -1,9 +1,11 @@
-enwiki,0,a,0
-enwiki,0,a,0
-enwiki,0,a,0
-enwiki,0,b,0
-enwiki,0,b,0
-enwiki,0,"title,with,comma",0
-enwiki,0,a,0
-enwiki,0,a,0
-enwiki,0,d,0
+enwiki,0,11
+enwiki,0,11
+enwiki,0,11
+enwiki,0,22
+enwiki,0,22
+enwiki,0,33
+enwiki,0,11
+enwiki,0,11
+enwiki,0,44
+enwiki,1,44
+enwiki,0,55
Lines changed: 4 additions & 4 deletions
@@ -1,4 +1,4 @@
-a,5
-b,2
-"title,with,comma",1
-d,1
+title1,5
+title2,2
+"title3,with,comma",1
+title4,1

tests/linktargets.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+11,title1
+22,title2
+33,"title3,with,comma"
+44,title4
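
Taken together, the fixtures exercise the new flow: the mapping above resolves the target ids in filter_pagelinks.test1.txt, the namespace-1 row and the unmapped id 55 are dropped, and the counts reduce to the 4-line expected file. A hypothetical manual run from the repository root; since filter_pagelinks.py opens its argument with gzip.open, the plain-text mapping fixture is compressed to a temporary file first:

```python
import gzip
import shutil
import subprocess
import tempfile

# Compress the plain-text mapping fixture; the filter expects a .gz file.
tmp_gz = tempfile.NamedTemporaryFile(suffix='.csv.gz', delete=False).name
with open('tests/linktargets.txt', 'rb') as fin, gzip.open(tmp_gz, 'wb') as fout:
    shutil.copyfileobj(fin, fout)

# Feed the pagelinks fixture on stdin, the role mysqldump_to_csv.py output
# plays in the real pipeline.
with open('tests/filter_pagelinks.test1.txt', 'rb') as test_input:
    result = subprocess.run(['python3', 'bin/filter_pagelinks.py', tmp_gz],
                            stdin=test_input, capture_output=True, text=True)

print(result.stdout, end='')
# Should print:
# title1,5
# title2,2
# "title3,with,comma",1
# title4,1
```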
