forked from hazelcast/hazelcast-jet-code-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
WordCount.java
127 lines (116 loc) · 4.82 KB
/
WordCount.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/*
* Copyright (c) 2008-2018, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.hazelcast.core.IMap;
import com.hazelcast.jet.Jet;
import com.hazelcast.jet.JetInstance;
import com.hazelcast.jet.config.InstanceConfig;
import com.hazelcast.jet.config.JetConfig;
import com.hazelcast.jet.pipeline.Pipeline;
import com.hazelcast.jet.pipeline.Sinks;
import com.hazelcast.jet.pipeline.Sources;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import static com.hazelcast.jet.Traversers.traverseArray;
import static com.hazelcast.jet.aggregate.AggregateOperations.counting;
import static com.hazelcast.jet.function.DistributedFunctions.wholeItem;
import static java.lang.Runtime.getRuntime;
import static java.util.Comparator.comparingLong;
/**
 * Demonstrates a simple Word Count job in the Pipeline API. Inserts the
 * text of The Complete Works of William Shakespeare into a Hazelcast
 * IMap, then lets Jet count the words in it and write its findings to
 * another IMap. The example looks at Jet's output and prints the 100 most
 * frequent words.
 */
public class WordCount {

    /** Name of the IMap holding the input: line number -> line of text. */
    private static final String BOOK_LINES = "bookLines";

    /** Name of the IMap receiving the output: word -> number of occurrences. */
    private static final String COUNTS = "counts";

    /** Classpath location of the input text, resolved relative to this class. */
    private static final String BOOK_RESOURCE = "books/shakespeare-complete-works.txt";

    /** Count of the word "the" that the job is expected to produce for the bundled text. */
    private static final long EXPECTED_COUNT_OF_THE = 27_843;

    /** Number of most frequent words printed by {@link #printResults()}. */
    private static final int TOP_WORDS_LIMIT = 100;

    /** The Jet instance used to submit the job and access the maps; assigned in {@link #setup()}. */
    private JetInstance jet;

    /**
     * Builds the word-count pipeline: reads the lines from the {@value #BOOK_LINES}
     * map, lower-cases each line and splits it on runs of non-word characters,
     * drops empty tokens, counts the occurrences of each distinct word and
     * writes the (word, count) pairs to the {@value #COUNTS} map.
     *
     * @return the pipeline, ready to be submitted as a job
     */
    private static Pipeline buildPipeline() {
        Pattern delimiter = Pattern.compile("\\W+");
        Pipeline p = Pipeline.create();
        p.drawFrom(Sources.<Long, String>map(BOOK_LINES))
         // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
         .flatMap(e -> traverseArray(delimiter.split(e.getValue().toLowerCase(Locale.ROOT))))
         // split() yields an empty first token when the line starts with a delimiter.
         .filter(word -> !word.isEmpty())
         .groupingKey(wholeItem())
         .aggregate(counting())
         .drainTo(Sinks.map(COUNTS));
        return p;
    }

    /**
     * Entry point: selects log4j for Hazelcast logging and runs the sample.
     *
     * @param args ignored
     * @throws Exception declared for compatibility; failures surface as unchecked exceptions
     */
    public static void main(String[] args) throws Exception {
        System.setProperty("hazelcast.logging.type", "log4j");
        new WordCount().go();
    }

    /**
     * Runs the whole sample: starts the Jet instances and loads the input,
     * executes the word-count job while timing it, prints the most frequent
     * words and verifies the count of the word "the". All Jet instances are
     * shut down at the end, whether or not the run succeeded.
     *
     * @throws AssertionError if the count of "the" is missing or differs from the expected value
     */
    private void go() {
        try {
            setup();
            System.out.print("\nCounting words... ");
            long start = System.nanoTime();
            Pipeline p = buildPipeline();
            jet.newJob(p).join();
            System.out.print("done in " + TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start) + " milliseconds.");
            printResults();
            IMap<String, Long> counts = jet.getMap(COUNTS);
            // get() returns null for an absent key; check it explicitly instead of
            // letting auto-unboxing fail with a NullPointerException.
            Long countOfThe = counts.get("the");
            if (countOfThe == null || countOfThe != EXPECTED_COUNT_OF_THE) {
                throw new AssertionError("Wrong count of 'the': expected "
                        + EXPECTED_COUNT_OF_THE + ", got " + countOfThe);
            }
            System.out.println("Count of 'the' is valid");
        } finally {
            Jet.shutdownAll();
        }
    }

    /**
     * Starts two Jet instances in this JVM and loads the book, line by line,
     * into the {@value #BOOK_LINES} map keyed by 1-based line number.
     *
     * @throws IllegalStateException if the book resource is not found on the classpath
     * @throws RuntimeException if the book cannot be located as a file or read
     */
    private void setup() {
        JetConfig cfg = new JetConfig();
        // Each instance gets half of the available processors (at least one);
        // presumably because two instances share this machine.
        cfg.setInstanceConfig(new InstanceConfig().setCooperativeThreadCount(
                Math.max(1, getRuntime().availableProcessors() / 2)));

        System.out.println("Creating Jet instance 1");
        jet = Jet.newJetInstance(cfg);
        System.out.println("Creating Jet instance 2");
        Jet.newJetInstance(cfg);

        System.out.println("Loading The Complete Works of William Shakespeare");
        URL bookUrl = getClass().getResource(BOOK_RESOURCE);
        if (bookUrl == null) {
            throw new IllegalStateException("Resource not found on the classpath: " + BOOK_RESOURCE);
        }
        try {
            IMap<Long, String> bookLines = jet.getMap(BOOK_LINES);
            Path book = Paths.get(bookUrl.toURI());
            // Single-element array: a mutable counter that the lambda below may update.
            long[] lineNum = {0};
            // Files.lines keeps the file open until the stream is closed.
            try (Stream<String> lines = Files.lines(book)) {
                lines.forEach(line -> bookLines.put(++lineNum[0], line));
            }
        } catch (IOException | URISyntaxException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Prints a table of the {@value #TOP_WORDS_LIMIT} most frequent words
     * found in the {@value #COUNTS} map, in descending order of count.
     */
    private void printResults() {
        System.out.format(" Top %d entries are:%n", TOP_WORDS_LIMIT);
        final Map<String, Long> counts = jet.getMap(COUNTS);
        System.out.println("/-------+---------\\");
        System.out.println("| Count | Word    |");
        System.out.println("|-------+---------|");
        counts.entrySet().stream()
              .sorted(comparingLong(Entry<String, Long>::getValue).reversed())
              .limit(TOP_WORDS_LIMIT)
              .forEach(e -> System.out.format("|%6d | %-8s|%n", e.getValue(), e.getKey()));
        System.out.println("\\-------+---------/");
    }
}