Skip to content

Commit d8abcfc

Browse files
committedMay 4, 2018
🔥 🔥 🔥 Initial commit 🔥🔥🔥
0 parents  commit d8abcfc

File tree

6 files changed

+966
-0
lines changed

6 files changed

+966
-0
lines changed
 

‎.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
/out/
2+
/lib/
3+
/.idea/
4+
/untitled2.iml

‎LICENSE

+674
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package de.philipppixel.markov;
2+
3+
/**
 * Provides lines of text (sentences) to the markov trainer.
 * <p>
 * Implementations only have to supply {@link #iterator()}; each element returned by the
 * iterator is one line of training text.
 */
public interface LineFeeder extends Iterable<String> {
}
+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
package de.philipppixel.markov;
2+
3+
import java.io.IOException;
4+
import java.nio.file.Files;
5+
import java.nio.file.Paths;
6+
import java.util.Iterator;
7+
import java.util.List;
8+
import java.util.Spliterator;
9+
import java.util.function.Consumer;
10+
import java.util.stream.Collectors;
11+
import java.util.stream.Stream;
12+
13+
public class ListFeeder implements LineFeeder {
14+
private String filePath;
15+
16+
public ListFeeder(String filePath) {
17+
if (filePath == null || filePath.isEmpty()) throw new IllegalArgumentException("filePath must not be empty.");
18+
this.filePath = filePath;
19+
}
20+
21+
private List<String> readFile() {
22+
try (Stream<String> stream = Files.lines(Paths.get(filePath))) {
23+
return stream.collect(Collectors.toList());
24+
} catch (IOException e) {
25+
throw new RuntimeException("Exception during reading file " + filePath, e);
26+
}
27+
}
28+
29+
@Override
30+
public Iterator<String> iterator() {
31+
return readFile().iterator();
32+
}
33+
34+
@Override
35+
public void forEach(Consumer<? super String> action) {
36+
readFile().forEach(action);
37+
}
38+
39+
@Override
40+
public Spliterator<String> spliterator() {
41+
return readFile().spliterator();
42+
}
43+
}
+152
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
package de.philipppixel.markov;
2+
3+
import java.util.*;
4+
import java.util.logging.Logger;
5+
6+
/**
7+
* A sliding window markov chain text generator.
8+
* <p>
9+
* <code>first second third.</code>: A window size of two takes two subsequent words and maps the following of it:
10+
* "first second" -&gt; third. "first second" is called a <code>prefix</code> while "third" is the <code>suffix</code>.
11+
*
12+
* Multiple occurrences of the same word possible and desired because the selection
13+
* probability rises later on.
14+
*
15+
* What is
16+
*/
17+
public class Markov2 {
18+
private static final int WINDOW_SIZE = 2;
19+
private static final int NULL_SAFE_RETRIES = 5;
20+
private static final int NUMBER_OF_SENTENCES = 150;
21+
private static final String SENTENCE_DELIMITER = ".";
22+
private static final String WORD_DELIMITER = " ";
23+
private static final String EMPTY_RESULT = "";
24+
private static final Logger LOG = Logger.getLogger(Markov2.class.getName());
25+
26+
private Map<String, List<String>> trainingMap = new HashMap<>();
27+
28+
public static void main(String[] args) {
29+
String filePath = args[0];
30+
if (filePath == null || filePath.isEmpty()) {
31+
throw new IllegalArgumentException("Please call with absolute file path as only argument");
32+
}
33+
34+
Markov2 app = new Markov2();
35+
LineFeeder feeder = new ListFeeder(filePath);
36+
app.train(feeder);
37+
38+
LOG.info("============ histogram " + app.trainingMap);
39+
40+
for (int i = 0; i < NUMBER_OF_SENTENCES; i++) {
41+
System.out.println(app.generate());
42+
}
43+
}
44+
45+
public void train(LineFeeder feeder) {
46+
for (String line : feeder) {
47+
trainSingleLine(line.toLowerCase());
48+
}
49+
}
50+
51+
private void trainSingleLine(String trainingLine) {
52+
String[] words = trainingLine.split(WORD_DELIMITER);
53+
Queue<String> recentWords = new ArrayDeque<>(WINDOW_SIZE);
54+
55+
for (int wordIndex = 0; wordIndex < words.length - 1; wordIndex++) {
56+
String currentWord = words[wordIndex];
57+
LOG.info("looking at " + currentWord);
58+
recentWords.add(currentWord); //tail
59+
60+
LOG.info("has suff data? " + recentWords.size());
61+
if (!hasSufficientData(recentWords)) {
62+
LOG.info("skip this round");
63+
continue;
64+
}
65+
66+
String prefix = mergePrefix(recentWords);
67+
LOG.info("Looking at prefix: " + prefix);
68+
List<String> suffixes = trainingMap.getOrDefault(prefix, new ArrayList<>());
69+
String suffixToAdd = words[wordIndex + 1];
70+
LOG.info("looking at suffix to add: " + suffixToAdd);
71+
suffixes.add(suffixToAdd);
72+
trainingMap.put(prefix, suffixes);
73+
LOG.info("map: " + trainingMap);
74+
75+
recentWords.remove(); //head
76+
}
77+
}
78+
79+
private boolean hasSufficientData(Queue<String> recentWords) {
80+
int currentPrefixSize = recentWords.size();
81+
if (currentPrefixSize > WINDOW_SIZE) {
82+
throw new IllegalStateException("Prefix size is too large: " + recentWords);
83+
}
84+
return currentPrefixSize == WINDOW_SIZE;
85+
}
86+
87+
private String mergePrefix(Queue<String> recentWords) {
88+
String merged = "";
89+
90+
for (String word : recentWords) {
91+
merged += word + WORD_DELIMITER;
92+
}
93+
return merged.trim();
94+
}
95+
96+
String generate() {
97+
String prefix = getRandomPrefix();
98+
String result = prefix;
99+
100+
int numberOfWordsPerSentence = 16;
101+
for (int i = 0; i < numberOfWordsPerSentence; i++) {
102+
String suffix = getRandomSuffix(prefix);
103+
result += WORD_DELIMITER + suffix;
104+
105+
if (suffix.equals(EMPTY_RESULT)) {
106+
break;
107+
}
108+
109+
prefix = createNewPrefix(prefix, suffix);
110+
}
111+
112+
return result.trim() + SENTENCE_DELIMITER;
113+
}
114+
115+
private String createNewPrefix(String prefix, String suffix) {
116+
String[] prefixes = prefix.split(WORD_DELIMITER);
117+
118+
Queue<String> prefixesAsQueue = new ArrayDeque<>(Arrays.asList(prefixes));
119+
prefixesAsQueue.add(suffix); // tail
120+
prefixesAsQueue.remove(); // head
121+
122+
return mergePrefix(prefixesAsQueue);
123+
}
124+
125+
private String getRandomSuffix(String prefix) {
126+
List<String> suffixes = trainingMap.get(prefix);
127+
128+
if (suffixes == null) {
129+
return EMPTY_RESULT;
130+
}
131+
int index = new Random().nextInt(suffixes.size());
132+
return suffixes.get(index);
133+
}
134+
135+
private String getRandomPrefix() {
136+
List<String> keys = null;
137+
int retry = 0;
138+
while (keys == null) {
139+
if (++retry >= NULL_SAFE_RETRIES) {
140+
throw new RuntimeException("Prefix fetching is stuck.");
141+
}
142+
keys = new ArrayList<>(trainingMap.keySet());
143+
}
144+
145+
int keyIndex = new Random().nextInt(keys.size());
146+
return keys.get(keyIndex);
147+
}
148+
149+
Map<String, List<String>> getTrainingMap() {
150+
return trainingMap;
151+
}
152+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
package de.philipppixel.markov;
2+
3+
import org.assertj.core.util.Lists;
4+
import org.junit.jupiter.api.Test;
5+
6+
import java.util.*;
7+
import java.util.function.Consumer;
8+
9+
import static org.assertj.core.api.Assertions.assertThat;
10+
11+
class Markov2Test {
12+
private Markov2 sut = new Markov2();
13+
14+
@Test
15+
void trainShouldResultInSingleElementList() {
16+
// given
17+
TestFeeder feeder = new TestFeeder("First Second Third");
18+
19+
// when
20+
sut.train(feeder);
21+
22+
// then
23+
Map<String, List<String>> actual = sut.getTrainingMap();
24+
assertThat(actual).isNotNull();
25+
26+
String expectedPrefix = "First Second".toLowerCase();
27+
ArrayList<String> expectedSuffix = Lists.newArrayList("Third".toLowerCase());
28+
assertThat(actual.get(expectedPrefix)).isEqualTo(expectedSuffix);
29+
}
30+
31+
@Test
32+
void testTrainOnFullText() {
33+
// given
34+
TestFeeder feeder = new TestFeeder("now he is gone she said he is gone for good");
35+
36+
// when
37+
sut.train(feeder);
38+
39+
// then
40+
Map<String, List<String>> actual = sut.getTrainingMap();
41+
assertThat(actual).isNotNull().hasSize(7);
42+
assertThat(actual.get("now he")).isEqualTo(Lists.newArrayList("is"));
43+
assertThat(actual.get("he is")).isEqualTo(Lists.newArrayList("gone", "gone"));
44+
assertThat(actual.get("is gone")).isEqualTo(Lists.newArrayList("she", "for"));
45+
assertThat(actual.get("gone she")).isEqualTo(Lists.newArrayList("said"));
46+
assertThat(actual.get("she said")).isEqualTo(Lists.newArrayList("he"));
47+
assertThat(actual.get("said he")).isEqualTo(Lists.newArrayList("is"));
48+
assertThat(actual.get("gone for")).isEqualTo(Lists.newArrayList("good"));
49+
}
50+
51+
@Test
52+
void generateShouldCreateNonEmptySentences() {
53+
// given
54+
TestFeeder feeder = new TestFeeder("now he is gone she said he is gone for good");
55+
sut.train(feeder);
56+
57+
// when
58+
String actual = sut.generate();
59+
60+
// then
61+
assertThat(actual).matches("[a-zA-Z]+( [a-zA-Z]+)+ ?\\.");
62+
}
63+
64+
private class TestFeeder implements LineFeeder {
65+
private List<String> line;
66+
67+
TestFeeder(String line) {
68+
this.line = Collections.singletonList(line);
69+
}
70+
71+
@Override
72+
public Iterator<String> iterator() {
73+
return line.iterator();
74+
}
75+
76+
@Override
77+
public void forEach(Consumer<? super String> action) {
78+
//
79+
}
80+
81+
@Override
82+
public Spliterator<String> spliterator() {
83+
return null;
84+
}
85+
}
86+
}

0 commit comments

Comments
 (0)
Please sign in to comment.