Skip to content

Commit c5c5000

Browse files
committed
feat(words): add methods to split text into words
1 parent 1ecbe3e commit c5c5000

File tree

5 files changed

+96
-18
lines changed

5 files changed

+96
-18
lines changed

README.md

100644100755
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ As of v1.0.0 the API is stable and used in multiple (personal) projects. Unless
1111

1212
## Change Log
1313

14+
**2019-08-20** v1.2.0 words methods
15+
1416
**2019-02-14** v1.1.0 classes methods
1517

1618
**2019-02-11** v1.0.0 initial release

htmlutil.go

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
See the License for the specific language governing permissions and
1414
limitations under the License.
15-
*/
15+
*/
1616

1717
// Package htmlutil implements a wrapper for Golang's html5 tokeniser / parser implementation, making it much easier to
1818
// find and extract information, aiming to be powerful and intuitive while remaining a minimal and logical extension.
@@ -153,7 +153,14 @@ func (n Node) OuterHTML() string {
153153

154154
// OuterText builds a string from the data of all text nodes in the sub-tree, starting from and including `n`
155155
func (n Node) OuterText() string {
156-
return encodeText(n.Data)
156+
return string(encodeText(n.Data))
157+
}
158+
159+
// OuterWords builds a space-separated string from the whitespace-separated data of all text nodes in the sub-tree,
160+
// starting from and including `n`, note that text separated / split across multiple elements will be considered as
161+
// multiple words (words within non-empty sibling elements will be split by a single space)
162+
func (n Node) OuterWords() string {
163+
return string(encodeWords(n.Data))
157164
}
158165

159166
// InnerHTML builds a string using the outer html of all children matching all filters (see the `FindNode` method)
@@ -182,6 +189,25 @@ func (n Node) InnerText(filters ...func(node Node) bool) string {
182189
return string(b)
183190
}
184191

192+
// InnerWords builds a string using the outer words of all children matching all filters (see the `FindNode` method and
193+
// the `OuterWords` methods)
194+
func (n Node) InnerWords(filters ...func(node Node) bool) string {
195+
var b []byte
196+
n.Range(
197+
func(i int, node Node) bool {
198+
if s := node.OuterWords(); s != `` {
199+
if len(b) != 0 {
200+
b = append(b, ' ')
201+
}
202+
b = append(b, []byte(s)...)
203+
}
204+
return true
205+
},
206+
filters...,
207+
)
208+
return string(b)
209+
}
210+
185211
// SiblingIndex returns the total number of previous siblings matching any filters (see the `FindNode` method)
186212
func (n Node) SiblingIndex(filters ...func(node Node) bool) int {
187213
return siblingIndex(n, filters...)

htmlutil_test.go

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
See the License for the specific language governing permissions and
1414
limitations under the License.
15-
*/
15+
*/
1616

1717
package htmlutil
1818

@@ -339,11 +339,27 @@ func TestEncodeHTML_panic(t *testing.T) {
339339
}
340340

341341
func TestEncodeText_nil(t *testing.T) {
342-
if v := encodeText(nil); v != "" {
342+
if v := encodeText(nil); v != nil {
343343
t.Fatal(v)
344344
}
345345
}
346346

347+
func TestEncodeWords_nil(t *testing.T) {
348+
if v := encodeWords(nil); v != nil {
349+
t.Fatal(v)
350+
}
351+
}
352+
353+
func TestEncodeWords_siblings(t *testing.T) {
354+
node, err := Parse(strings.NewReader(`<div>one</div><div>two</div><div><div><div></div></div></div><div></div><div><div></div><div>three</div></div><div>four</div>`))
355+
if err != nil {
356+
t.Fatal(err)
357+
}
358+
if v := string(encodeWords(node.Data)); v != `one two three four` {
359+
t.Error(v)
360+
}
361+
}
362+
347363
func TestParse_eof(t *testing.T) {
348364
reader, _ := io.Pipe()
349365
_ = reader.Close()
@@ -425,6 +441,20 @@ FOUR !
425441
` {
426442
t.Fatal(v)
427443
}
444+
if v := node.InnerWords(); v != `ONE TWO THREE FOUR !` {
445+
t.Fatal(v)
446+
}
447+
if v := node.InnerWords(func(node Node) bool {
448+
return node.Offset() == 0 &&
449+
node.Type() == html.TextNode
450+
}); v != `FOUR !` {
451+
t.Fatal(v)
452+
}
453+
if v := node.InnerWords(func(node Node) bool {
454+
return node.Offset() == 100
455+
}); v != `` {
456+
t.Fatal(v)
457+
}
428458
}
429459

430460
func TestNode_GetAttr_caseInsensitive(t *testing.T) {

internal.go

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
See the License for the specific language governing permissions and
1414
limitations under the License.
15-
*/
15+
*/
1616

1717
package htmlutil
1818

@@ -154,7 +154,18 @@ func getNode(node Node, filters ...func(node Node) bool) Node {
154154
return result
155155
}
156156

157-
func encodeTextBytes(node *html.Node) []byte {
157+
func encodeHTML(node *html.Node) string {
158+
if node == nil {
159+
return ""
160+
}
161+
buffer := new(bytes.Buffer)
162+
if err := html.Render(buffer, node); err != nil {
163+
panic(err)
164+
}
165+
return buffer.String()
166+
}
167+
168+
func encodeText(node *html.Node) []byte {
158169
if node == nil {
159170
return nil
160171
}
@@ -163,24 +174,33 @@ func encodeTextBytes(node *html.Node) []byte {
163174
}
164175
var b []byte
165176
for node := node.FirstChild; node != nil; node = node.NextSibling {
166-
b = append(b, encodeTextBytes(node)...)
177+
b = append(b, encodeText(node)...)
167178
}
168179
return b
169180
}
170181

171-
func encodeText(node *html.Node) string {
172-
return string(encodeTextBytes(node))
173-
}
174-
175-
func encodeHTML(node *html.Node) string {
182+
func encodeWords(node *html.Node) (b []byte) {
176183
if node == nil {
177-
return ""
184+
return
178185
}
179-
buffer := new(bytes.Buffer)
180-
if err := html.Render(buffer, node); err != nil {
181-
panic(err)
186+
if node.Type == html.TextNode {
187+
for _, word := range strings.Fields(node.Data) {
188+
if len(b) != 0 {
189+
b = append(b, ' ')
190+
}
191+
b = append(b, []byte(word)...)
192+
}
193+
return
182194
}
183-
return buffer.String()
195+
for node := node.FirstChild; node != nil; node = node.NextSibling {
196+
if words := encodeWords(node); len(words) != 0 {
197+
if len(b) != 0 {
198+
b = append(b, ' ')
199+
}
200+
b = append(b, words...)
201+
}
202+
}
203+
return
184204
}
185205

186206
func getAttr(namespace string, key string, attributes ...html.Attribute) (html.Attribute, bool) {

internal_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
See the License for the specific language governing permissions and
1414
limitations under the License.
15-
*/
15+
*/
1616

1717
package htmlutil
1818

0 commit comments

Comments
 (0)