#!/bin/bash
if [ $# -lt 2 ]; then
    cat <<EOF
$0 <year-week-of-crawl> <path-to-warc-file-list> [<split_file>]

Create a Common Crawl index for a monthly crawl. All steps are run on Hadoop.

  <year-week-of-crawl>      year and week of the monthly crawl to be indexed, e.g. 2016-44,
                            used to determine the location of the index:
                              s3://commoncrawl/cc-index/collections/CC-MAIN-2016-44/...

  <path-to-warc-file-list>  list of WARC file objects to be indexed, e.g., the WARC list
                              s3://commoncrawl/crawl-data/CC-MAIN-2016-44/warc.paths.gz
                            Paths in the list must be keys/objects in the Common Crawl bucket.
                            The path to the list must be a valid and complete HDFS or S3A URL,
                            e.g. hdfs://hdfs-master.example.com/user/hadoop-user/CC-MAIN-2016-44.paths
                            The "index warcs" step is skipped if an empty string is passed
                            as this argument.

  <split_file>              optional split file to be reused from a previous crawl with a
                            similar distribution of URLs. If not given, splits are calculated
                            and saved to the default split file path.

Required environment variables:

  AWS_ACCESS_KEY_ID         AWS credentials used by Boto to access the bucket (read and write)
  AWS_SECRET_ACCESS_KEY
EOF
    exit 1
fi
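
# Example invocation (hypothetical hostname and paths, cf. the usage message above):
#   ./run_index_hadoop.sh 2016-44 \
#       hdfs://hdfs-master.example.com/user/hadoop-user/CC-MAIN-2016-44.paths
# Reusing the split file of a previous crawl and skipping the "index warcs" step:
#   ./run_index_hadoop.sh 2016-44 "" s3a://cc-cdx-index/2016-40_splits.seq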
if [ -z "$AWS_ACCESS_KEY_ID" ] || [ -z "$AWS_SECRET_ACCESS_KEY" ]; then
    echo "AWS credentials must be passed to Boto via the environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY!" >&2
    exit 1
fi
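# The credentials are forwarded to the Hadoop tasks via --cmdenv below.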
YEARWEEK="$1"
WARC_MANIFEST="$2"
REUSE_SPLIT_FILE="$3"
echo "Generating cc-index for $YEARWEEK"
echo
echo WARC_MANIFEST="$WARC_MANIFEST"
echo
export WARC_CDX="s3a://commoncrawl/cc-index/cdx/CC-MAIN-$YEARWEEK/segments/*/*/*.cdx.gz"
export WARC_CDX_BUCKET="commoncrawl"
export ZIPNUM_CLUSTER_DIR="s3a://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/"
# SPLIT_FILE may be reused from a previous crawl with a similar distribution of URLs, see REUSE_SPLIT_FILE
export SPLIT_FILE="s3a://cc-cdx-index/${YEARWEEK}_splits.seq"
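# C locale: byte-wise string comparison, so that CDX keys sort consistently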
export LC_ALL=C
set -e
set -x
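
# Step 1: index the WARC files listed in the manifest, writing CDX files
# into the CDX bucket (skipped if an empty manifest argument was passed)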
if [ -n "$WARC_MANIFEST" ]; then
    python indexwarcsjob.py \
        --cdx_bucket="$WARC_CDX_BUCKET" \
        --no-output \
        --cleanup NONE \
        --skip-existing \
        --cmdenv AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" \
        --cmdenv AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" \
        -r hadoop \
        --jobconf "mapreduce.map.memory.mb=800" \
        --jobconf "mapreduce.map.java.opts=-Xmx512m" \
        "$WARC_MANIFEST"
fi
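
# Step 2: unless a split file is reused, sample the CDX records to compute
# split points which partition the URL space into 300 shards, and upload
# the resulting split file to its S3 location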
if [ -n "$REUSE_SPLIT_FILE" ]; then
    SPLIT_FILE="$REUSE_SPLIT_FILE"
else
    # mapreduce.output.fileoutputformat.compress=true
    #   task output must be compressed, otherwise the single reducer node
    #   may fail with a full disk; even compressed, it may require 60 GB
    #   of local disk space on the reducer node
    # mapreduce.map.memory.mb=640
    #   mappers read only small CDX files: minimal memory requirements
    python dosample.py \
        --verbose \
        --shards=300 \
        --splitfile="$SPLIT_FILE" \
        --cmdenv AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" \
        --cmdenv AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" \
        --jobconf "mapreduce.map.memory.mb=640" \
        --jobconf "mapreduce.map.java.opts=-Xmx512m" \
        --jobconf "mapreduce.reduce.memory.mb=1024" \
        --jobconf "mapreduce.reduce.java.opts=-Xmx512m" \
        --jobconf "mapreduce.output.fileoutputformat.compress=true" \
        -r hadoop "$WARC_CDX"
    # s3cmd expects an s3:// URL, SPLIT_FILE is an s3a:// URL
    SPLIT_FILE_S3="s3${SPLIT_FILE#s3a}"
    mv splits.seq "$(basename "$SPLIT_FILE_S3")"
    if s3cmd info "$SPLIT_FILE_S3"; then
        echo "Ok, split file is already uploaded"
    else
        echo "Uploading split file ..."
        s3cmd put "$(basename "$SPLIT_FILE_S3")" "$SPLIT_FILE_S3"
    fi
fi
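
# Step 3: sort and merge all CDX records into the sharded ZipNum cluster
# index, written to $ZIPNUM_CLUSTER_DIR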
python zipnumclusterjob.py \
    --shards=300 \
    --splitfile="$SPLIT_FILE" \
    --output-dir="$ZIPNUM_CLUSTER_DIR" \
    --no-output \
    --cmdenv AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" \
    --cmdenv AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" \
    --jobconf "mapreduce.map.memory.mb=640" \
    --jobconf "mapreduce.map.java.opts=-Xmx512m" \
    --jobconf "mapreduce.reduce.memory.mb=1536" \
    --jobconf "mapreduce.reduce.java.opts=-Xmx1024m" \
    -r hadoop "$WARC_CDX"