|
17 | 17 | mkdir -p "$CONVERTED_PATH/$LANG/"
|
18 | 18 |
|
19 | 19 | echo "[language $LANG] Page table SQL => CSV"
|
| 20 | + # https://www.mediawiki.org/wiki/Manual:Page_table |
| 21 | + # |
20 | 22 | # CREATE TABLE `page` (
|
21 | 23 | # `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
|
22 | 24 | # `page_namespace` int(11) NOT NULL DEFAULT 0,
|
23 | 25 | # `page_title` varbinary(255) NOT NULL DEFAULT '',
|
24 |
| - # `page_restrictions` tinyblob DEFAULT NULL, |
25 | 26 | # `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
|
26 | 27 | # `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0,
|
27 | 28 | # `page_random` double unsigned NOT NULL DEFAULT 0,
|
|
44 | 45 | pigz -9 > $CONVERTED_PATH/$LANG/pages.csv.gz
|
45 | 46 |
|
46 | 47 |
|
| 48 | + echo "[language $LANG] linktarget table SQL => CSV" |
| 49 | + # https://www.mediawiki.org/wiki/Manual:Linktarget_table |
| 50 | + # |
| 51 | + # CREATE TABLE `linktarget` ( |
| 52 | + # `lt_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, |
| 53 | + # `lt_namespace` int(11) NOT NULL, |
| 54 | + # `lt_title` varbinary(255) NOT NULL, |
| 55 | + # |
| 56 | + # Only interested in lt_namespace == 0 (articles) |
| 57 | + # English wikipedia: |
| 58 | + # input 964MB compressed (100m rows) |
| 59 | + # output 322MB compressed (30m rows) |
| 60 | + # Output columns: lt_id, lt_title |
| 61 | + |
| 62 | + unpigz -c $DOWNLOADED_PATH/${LANG}/linktarget.sql.gz | \ |
| 63 | + bin/mysqldump_to_csv.py | \ |
| 64 | + bin/filter_redirect.py | \ |
| 65 | + pigz -9 > $CONVERTED_PATH/$LANG/linktarget.csv.gz |
| 66 | + |
| 67 | + |
| 68 | + |
47 | 69 | echo "[language $LANG] Pagelinks table SQL => CSV"
|
| 70 | + # https://www.mediawiki.org/wiki/Manual:Pagelinks_table |
| 71 | + # |
48 | 72 | # CREATE TABLE `pagelinks` (
|
49 |
| - # `pl_from` int(8) unsigned NOT NULL DEFAULT 0, |
50 |
| - # `pl_namespace` int(11) NOT NULL DEFAULT 0, |
51 |
| - # `pl_title` varbinary(255) NOT NULL DEFAULT '', |
52 |
| - # `pl_from_namespace` int(11) NOT NULL DEFAULT 0, |
| 73 | + # `pl_from` int(8) unsigned NOT NULL DEFAULT 0, |
| 74 | + # `pl_from_namespace` int(11) NOT NULL DEFAULT 0, |
| 75 | + # `pl_target_id` bigint(20) unsigned NOT NULL, |
53 | 76 | #
|
54 |
| - # Only interested in pl_namespace == 0 (articles) |
| 77 | + # Only interested in pl_target_id values whose linktarget row has lt_namespace == 0 (articles) |
55 | 78 | # English wikipedia:
|
56 | 79 | # input 6.8GB compressed
|
57 | 80 | # output 200MB compressed
|
58 |
| - # Output columns: pl_title, count |
| 81 | + # Output columns: lt_title (from linktarget file), count (unique pl_from) |
59 | 82 |
|
60 | 83 | unpigz -c $DOWNLOADED_PATH/$LANG/pagelinks.sql.gz | \
|
61 | 84 | bin/mysqldump_to_csv.py | \
|
62 |
| - bin/filter_pagelinks.py | \ |
| 85 | + bin/filter_pagelinks.py $CONVERTED_PATH/$LANG/linktarget.csv.gz | \ |
63 | 86 | pigz -9 > $CONVERTED_PATH/$LANG/pagelinks.csv.gz
|
64 | 87 |
|
65 | 88 |
|
66 | 89 | echo "[language $LANG] langlinks table SQL => CSV"
|
| 90 | + # https://www.mediawiki.org/wiki/Manual:Langlinks_table |
| 91 | + # |
67 | 92 | # CREATE TABLE `langlinks` (
|
68 | 93 | # `ll_from` int(8) unsigned NOT NULL DEFAULT 0,
|
69 | 94 | # `ll_lang` varbinary(35) NOT NULL DEFAULT '',
|
|
81 | 106 | pigz -9 > $CONVERTED_PATH/$LANG/langlinks.csv.gz
|
82 | 107 |
|
83 | 108 |
|
| 109 | + |
| 110 | + |
84 | 111 | echo "[language $LANG] redirect table SQL => CSV"
|
| 112 | + # https://www.mediawiki.org/wiki/Manual:Redirect_table |
| 113 | + # |
85 | 114 | # CREATE TABLE `redirect` (
|
86 | 115 | # `rd_from` int(8) unsigned NOT NULL DEFAULT 0,
|
87 | 116 | # `rd_namespace` int(11) NOT NULL DEFAULT 0,
|
|
0 commit comments