-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.h
120 lines (104 loc) · 3.44 KB
/
utils.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#ifndef UTILS_H
#define UTILS_H
#include <algorithm>
#include <cctype>
#include <string>
#include <vector>
#include <unicode/translit.h>
#include <unicode/regex.h>
namespace casimiro {
/// List of strings; this is the type GetWordsFromText() returns.
typedef std::vector<std::string> StringVector;
// Everything below is only declared here (extern): the objects are defined and
// initialised outside this header, and the functions in this header
// dereference the pointers without checking them.
/// ICU error code passed to every replaceAll() and group() call in this header.
extern UErrorCode status;
/// Transliterator applied by ReplaceNonAsciiChars().
extern Transliterator* TRANSLITERATOR;
/// Matcher whose matches RemoveLaughs() deletes.
extern RegexMatcher* LAUGH_MATCHER;
/// Matcher whose matches RemoveLineBreakAndTabulations() replaces with a space.
extern RegexMatcher* LINE_BREAK_TAB_MATCHER;
/// Matcher whose matches RemoveSmileys() deletes.
extern RegexMatcher* SMILEYS_MATCHER;
/// Matcher whose matches RemoveURLs() deletes.
extern RegexMatcher* URL_MATCHER;
/// Matcher whose matches RemoveMentions() deletes.
extern RegexMatcher* MENTION_MATCHER;
/// Matcher whose successive matches GetWordsFromText() collects as words.
extern RegexMatcher* WORD_MATCHER;
/// Matcher whose matches RemoveStopWords() deletes.
extern RegexMatcher* STOP_WORDS_MATCHER;
/**
 * @brief Runs the shared TRANSLITERATOR over a UTF-8 string.
 *
 * The input is decoded into an ICU UnicodeString, transliterated in place and
 * re-encoded as UTF-8. Which characters get rewritten depends entirely on how
 * TRANSLITERATOR was configured, which happens outside this header.
 *
 * @param _dirty UTF-8 text to transliterate.
 * @return The transliterated text, UTF-8 encoded.
 *
 * @note TRANSLITERATOR is dereferenced unconditionally; it must be initialised
 *       before the first call.
 */
inline std::string ReplaceNonAsciiChars(const std::string& _dirty)
{
    // Pass the std::string itself (pointer + length) rather than c_str():
    // the C-string form stops at the first embedded '\0' and needs an extra
    // strlen pass over the input.
    auto uCleaned = UnicodeString::fromUTF8(_dirty);
    TRANSLITERATOR->transliterate(uCleaned);

    std::string cleaned;
    uCleaned.toUTF8String<std::string>(cleaned);
    return cleaned;
}
/**
 * @brief Replaces every match of LINE_BREAK_TAB_MATCHER with a single space.
 *
 * The pattern is defined outside this header; judging by the matcher's name it
 * presumably targets line breaks and tab characters.
 *
 * @param _dirty UTF-8 text to clean.
 * @return The text with each match replaced by " ", UTF-8 encoded.
 *
 * @note The shared matcher is re-bound to this call's input via reset(), so
 *       every call modifies LINE_BREAK_TAB_MATCHER.
 * @note The shared `status` is handed to ICU but is neither cleared before nor
 *       inspected after the call. NOTE(review): confirm how callers detect and
 *       reset ICU failures.
 */
inline std::string RemoveLineBreakAndTabulations(const std::string& _dirty)
{
    // Pass the std::string itself rather than c_str() so that text after an
    // embedded '\0' is not silently dropped and no extra strlen is needed.
    auto uDirty = UnicodeString::fromUTF8(_dirty);
    LINE_BREAK_TAB_MATCHER->reset(uDirty);
    auto uCleaned = LINE_BREAK_TAB_MATCHER->replaceAll(" ", status);

    std::string cleaned;
    uCleaned.toUTF8String<std::string>(cleaned);
    return cleaned;
}
/**
 * @brief Deletes every match of SMILEYS_MATCHER from the text.
 *
 * The pattern is defined outside this header.
 *
 * @param _dirty UTF-8 text to clean.
 * @return The text with each match removed, UTF-8 encoded.
 *
 * @note The shared matcher is re-bound to this call's input via reset(), so
 *       every call modifies SMILEYS_MATCHER.
 * @note The shared `status` is handed to ICU but is neither cleared before nor
 *       inspected after the call. NOTE(review): confirm how callers detect and
 *       reset ICU failures.
 */
inline std::string RemoveSmileys(const std::string& _dirty)
{
    // Pass the std::string itself rather than c_str() so that text after an
    // embedded '\0' is not silently dropped and no extra strlen is needed.
    auto uDirty = UnicodeString::fromUTF8(_dirty);
    SMILEYS_MATCHER->reset(uDirty);
    auto uCleaned = SMILEYS_MATCHER->replaceAll("", status);

    std::string cleaned;
    uCleaned.toUTF8String<std::string>(cleaned);
    return cleaned;
}
/**
 * @brief Deletes every match of LAUGH_MATCHER from the text.
 *
 * The pattern is defined outside this header.
 *
 * @param _dirty UTF-8 text to clean.
 * @return The text with each match removed, UTF-8 encoded.
 *
 * @note The shared matcher is re-bound to this call's input via reset(), so
 *       every call modifies LAUGH_MATCHER.
 * @note The shared `status` is handed to ICU but is neither cleared before nor
 *       inspected after the call. NOTE(review): confirm how callers detect and
 *       reset ICU failures.
 */
inline std::string RemoveLaughs(const std::string& _dirty)
{
    // Pass the std::string itself rather than c_str() so that text after an
    // embedded '\0' is not silently dropped and no extra strlen is needed.
    auto uDirty = UnicodeString::fromUTF8(_dirty);
    LAUGH_MATCHER->reset(uDirty);
    auto uCleaned = LAUGH_MATCHER->replaceAll("", status);

    std::string cleaned;
    uCleaned.toUTF8String<std::string>(cleaned);
    return cleaned;
}
/**
 * @brief Deletes every match of URL_MATCHER from the text.
 *
 * The pattern is defined outside this header.
 *
 * @param _dirty UTF-8 text to clean.
 * @return The text with each match removed, UTF-8 encoded.
 *
 * @note The shared matcher is re-bound to this call's input via reset(), so
 *       every call modifies URL_MATCHER.
 * @note The shared `status` is handed to ICU but is neither cleared before nor
 *       inspected after the call. NOTE(review): confirm how callers detect and
 *       reset ICU failures.
 */
inline std::string RemoveURLs(const std::string& _dirty)
{
    // Pass the std::string itself rather than c_str() so that text after an
    // embedded '\0' is not silently dropped and no extra strlen is needed.
    auto uDirty = UnicodeString::fromUTF8(_dirty);
    URL_MATCHER->reset(uDirty);
    auto uCleaned = URL_MATCHER->replaceAll("", status);

    std::string cleaned;
    uCleaned.toUTF8String<std::string>(cleaned);
    return cleaned;
}
/**
 * @brief Deletes every match of MENTION_MATCHER from the text.
 *
 * The pattern is defined outside this header.
 *
 * @param _dirty UTF-8 text to clean.
 * @return The text with each match removed, UTF-8 encoded.
 *
 * @note The shared matcher is re-bound to this call's input via reset(), so
 *       every call modifies MENTION_MATCHER.
 * @note The shared `status` is handed to ICU but is neither cleared before nor
 *       inspected after the call. NOTE(review): confirm how callers detect and
 *       reset ICU failures.
 */
inline std::string RemoveMentions(const std::string& _dirty)
{
    // Pass the std::string itself rather than c_str() so that text after an
    // embedded '\0' is not silently dropped and no extra strlen is needed.
    auto uDirty = UnicodeString::fromUTF8(_dirty);
    MENTION_MATCHER->reset(uDirty);
    auto uCleaned = MENTION_MATCHER->replaceAll("", status);

    std::string cleaned;
    uCleaned.toUTF8String<std::string>(cleaned);
    return cleaned;
}
/**
 * @brief Deletes every match of STOP_WORDS_MATCHER from the text.
 *
 * The pattern (and therefore the stop-word list) is defined outside this
 * header.
 *
 * @param _dirty UTF-8 text to clean.
 * @return The text with each match removed, UTF-8 encoded.
 *
 * @note The shared matcher is re-bound to this call's input via reset(), so
 *       every call modifies STOP_WORDS_MATCHER.
 * @note The shared `status` is handed to ICU but is neither cleared before nor
 *       inspected after the call. NOTE(review): confirm how callers detect and
 *       reset ICU failures.
 */
inline std::string RemoveStopWords(const std::string& _dirty)
{
    // Pass the std::string itself rather than c_str() so that text after an
    // embedded '\0' is not silently dropped and no extra strlen is needed.
    auto uDirty = UnicodeString::fromUTF8(_dirty);
    STOP_WORDS_MATCHER->reset(uDirty);
    auto uCleaned = STOP_WORDS_MATCHER->replaceAll("", status);

    std::string cleaned;
    uCleaned.toUTF8String<std::string>(cleaned);
    return cleaned;
}
/**
 * @brief Cleans a text and splits it into words.
 *
 * Steps, in order:
 *  1. ReplaceNonAsciiChars()
 *  2. per-byte lower-casing
 *  3. RemoveSmileys()
 *  4. RemoveLineBreakAndTabulations()
 *  5. RemoveLaughs()
 *  6. RemoveURLs()
 *  7. RemoveMentions()
 *  8. RemoveStopWords()
 *  9. every successive match of WORD_MATCHER in the result becomes one word.
 *
 * @param _text UTF-8 text to tokenise.
 * @return The matches of WORD_MATCHER, in the order they were found, each
 *         UTF-8 encoded. Empty when nothing matches.
 *
 * @note The shared `status` is handed to group() but is neither cleared nor
 *       inspected here. NOTE(review): confirm how callers detect and reset
 *       ICU failures.
 */
inline StringVector GetWordsFromText(const std::string& _text)
{
    auto cleaned = ReplaceNonAsciiChars(_text);

    // tolower() requires a value representable as unsigned char (or EOF);
    // feeding it a plain char is undefined for negative bytes, so convert
    // explicitly before the call and narrow back afterwards.
    std::transform(cleaned.begin(), cleaned.end(), cleaned.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });

    cleaned = RemoveSmileys(cleaned);
    cleaned = RemoveLineBreakAndTabulations(cleaned);
    cleaned = RemoveLaughs(cleaned);
    cleaned = RemoveURLs(cleaned);
    cleaned = RemoveMentions(cleaned);
    cleaned = RemoveStopWords(cleaned);

    // Pass the std::string itself rather than c_str() so that text after an
    // embedded '\0' is not silently dropped.
    auto uText = UnicodeString::fromUTF8(cleaned);
    WORD_MATCHER->reset(uText);

    StringVector words;
    while (WORD_MATCHER->find())
    {
        // Encode the match directly into the new vector slot instead of
        // filling a temporary string and copying it in.
        words.emplace_back();
        WORD_MATCHER->group(status).toUTF8String<std::string>(words.back());
    }
    return words;
}
}
#endif //UTILS_H