forked from advancedlogic/GoOse
-
Notifications
You must be signed in to change notification settings - Fork 2
/
goose.go
41 lines (35 loc) · 1.03 KB
/
goose.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
package goose
import (
"github.com/pkg/errors"
)
// Goose is the article extractor exposed by this package. It carries the
// Configuration that its extraction methods hand to the requester and crawler.
type Goose struct {
	// config holds the settings supplied at construction time.
	config Configuration
}
// New builds a Goose whose configuration comes from GetDefaultConfiguration.
// The variadic args are forwarded to GetDefaultConfiguration unchanged; their
// meaning is defined there.
func New(args ...string) Goose {
	cfg := GetDefaultConfiguration(args...)
	return Goose{config: cfg}
}
// NewWithConfig builds a Goose that uses the caller-supplied configuration
// as-is, instead of the default one.
func NewWithConfig(config Configuration) Goose {
	return Goose{config: config}
}
// ExtractFromURL fetches the HTML behind url and runs the crawler over it.
//
// The page is retrieved through a requester built from the Goose
// configuration. If the fetch fails, a nil article is returned together with
// the underlying error wrapped with additional context. Otherwise the fetched
// HTML and the URL are passed to a crawler built from the same configuration,
// and its result is returned unchanged.
func (g Goose) ExtractFromURL(url string) (*Article, error) {
	requester := NewHtmlRequester(g.config)
	html, err := requester.fetchHTML(url)
	if err != nil {
		return nil, errors.Wrap(err, "could not get html from site")
	}

	crawler := NewCrawler(g.config)
	return crawler.Crawl(html, url)
}
// ExtractFromRawHTML runs the crawler over HTML the caller already has.
// No fetch is performed by this method: RawHTML and url are passed straight to
// a crawler built from the Goose configuration, and its result is returned.
func (g Goose) ExtractFromRawHTML(RawHTML string, url string) (*Article, error) {
	crawler := NewCrawler(g.config)
	return crawler.Crawl(RawHTML, url)
}