Skip to content

Commit 03de171

Browse files
committed
Merge pull request #92 from sesteel/master
Removed regex use from plain text tokenizer
2 parents f1d10aa + 1a364de commit 03de171

File tree

4 files changed

+27
-18
lines changed

4 files changed

+27
-18
lines changed

gradle.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Project details
22
group=com.readytalk
3-
version=2.0.0
3+
version=2.0.1
44

55

66
# Optimize the build environment

src/integTest/java/com/readytalk/swt/text/painter/TextPainterIntegTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ public void setup() {
3131
}
3232

3333
@Test
34-
public void setText_ConfiguredToTokenizePlainText_SeventyOneTokens () {
34+
public void setText_ConfiguredToTokenizePlainText_SixtyNineTokens () {
3535
painter.setText(PLAIN_TEXT);
36-
assertEquals(71, painter.getTokens().size());
36+
assertEquals(69, painter.getTokens().size());
3737
}
3838

3939
@Test

src/main/java/com/readytalk/swt/text/tokenizer/PlainTextTokenizer.java

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,6 @@
77

88
class PlainTextTokenizer implements TextTokenizer {
99

10-
private static final String NEWLINE_SAFETY_TOKEN = "<:br:>";
11-
private static final String SPACE_SAFETY_TOKEN = "<:sp:>";
12-
private static final String SPACE_SPLITABLE_TOKEN = "<:ssp:>";
13-
private static final String NEWLINE = "\n";
14-
private static final String WHITESPACE_REGEX = "\\s+";
15-
1610
private List<TextToken> tokens = new ArrayList<TextToken>();
1711

1812
@Override
@@ -21,23 +15,38 @@ public TextTokenizer reset() {
2115
return this;
2216
}
2317

18+
/**
19+
* Tokenize will accept a string and return a list of TextTokens.
20+
* The algorithm used by PlainTextTokenizer#tokenize will convert all
21+
* whitespace to a ' ' character. Furthermore, consecutive whitespace
22+
* characters will be collapsed into a single ' ' character.
23+
*
24+
* @param text the plain text to tokenize; null or empty input adds no tokens
25+
* @return the tokenizer's internal list of TextTokens, including tokens from earlier calls since the last reset
26+
*/
2427
public List<TextToken> tokenize(String text) {
2528

2629
if (text == null || "".equals(text)) {
2730
return tokens;
2831
}
2932

30-
String str = text.replace(NEWLINE, NEWLINE_SAFETY_TOKEN);
31-
str = str.replaceAll(WHITESPACE_REGEX, SPACE_SPLITABLE_TOKEN);
32-
for (String s : str.split(SPACE_SPLITABLE_TOKEN)) {
33-
if (s.contains(NEWLINE_SAFETY_TOKEN)) {
34-
tokens.add(new TextToken(TextType.TEXT, s.replace(NEWLINE_SAFETY_TOKEN, NEWLINE)));
33+
StringBuffer sb = new StringBuffer();
34+
char[] chars = text.toCharArray();
35+
for (char c : chars) {
36+
if (!Character.isWhitespace(c)) {
37+
sb.append(c);
3538
} else {
36-
tokens.add(new TextToken(TextType.TEXT, s));
37-
tokens.add(new TextToken(TextType.WHITESPACE, " "));
39+
if (sb.length() > 0) {
40+
tokens.add(new TextToken(TextType.TEXT, sb.toString()));
41+
tokens.add(new TextToken(TextType.WHITESPACE, " "));
42+
sb = new StringBuffer();
43+
}
3844
}
3945
}
4046

47+
if (sb.length() > 0) {
48+
tokens.add(new TextToken(TextType.TEXT, sb.toString()));
49+
}
4150
return tokens;
4251
}
4352
}

src/test/java/com/readytalk/swt/text/tokenizer/TextTokenizerTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ public void tokenize_MultipleTokenizeCalls_TokenListContainsMoreElements() {
2121
textTokenizer.tokenize(PLAIN_TEXT);
2222
textTokenizer.tokenize(PLAIN_TEXT);
2323
textTokenizer.tokenize(PLAIN_TEXT);
24-
Assert.assertEquals(284, textTokenizer.tokenize(PLAIN_TEXT).size());
24+
Assert.assertEquals(276, textTokenizer.tokenize(PLAIN_TEXT).size());
2525
}
2626

2727
@Test
@@ -30,6 +30,6 @@ public void reset_MultipleTokenizeCalls_ResetClearsInternalList() {
3030
textTokenizer.tokenize(PLAIN_TEXT);
3131
textTokenizer.tokenize(PLAIN_TEXT);
3232
textTokenizer.reset();
33-
Assert.assertEquals(71, textTokenizer.tokenize(PLAIN_TEXT).size());
33+
Assert.assertEquals(69, textTokenizer.tokenize(PLAIN_TEXT).size());
3434
}
3535
}

0 commit comments

Comments
 (0)