-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: allan716 <[email protected]>
- Loading branch information
1 parent
7a7616f
commit dd35373
Showing
9 changed files
with
365 additions
and
116 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,274 @@ | ||
package rod_helper | ||
|
||
import ( | ||
"github.com/WQGroup/logger" | ||
"github.com/go-rod/rod" | ||
"github.com/go-rod/rod/lib/proto" | ||
"github.com/pkg/errors" | ||
"math/rand" | ||
"os" | ||
"path/filepath" | ||
"time" | ||
) | ||
|
||
func GetFakeUserAgentDataCache(tmpRootFolder, httpProxyURL string) error { | ||
|
||
/* | ||
暂时只获取: | ||
1. Browsers | ||
子分类中,再细化一点,只获取: | ||
// 桌面浏览器 | ||
1. Chrome | ||
2. Edge | ||
3. Firefox | ||
4. Opera | ||
5. Safari | ||
6. Mozilla | ||
*/ | ||
// 直接查找所有的 A 的链接,然后读取 href 信息,匹配 <a href="/pages/Chrome/ " class="unterMenuName">Chrome</a> | ||
nowBrowser, err := NewBrowserBase(tmpRootFolder, "", httpProxyURL, true, false) | ||
if err != nil { | ||
return err | ||
} | ||
defer nowBrowser.Close() | ||
var nowPage *rod.Page | ||
nowPage, err = NewPage(nowBrowser.Browser) | ||
if err != nil { | ||
return err | ||
} | ||
defer func() { | ||
_ = nowPage.Close() | ||
}() | ||
|
||
err = parseUAAllPage(nowPage) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func parseUAAllPage(nowPage *rod.Page) error { | ||
|
||
// 所有的 UA 的 SubType 都在这里 | ||
const allInfoPage = "https://useragentstring.com/pages/useragentstring.php" | ||
var err error | ||
var p *proto.NetworkResponseReceived | ||
nowPage, p, err = PageNavigate(nowPage, allInfoPage, 15*time.Second) | ||
if err != nil { | ||
return err | ||
} | ||
statusCode := StatusCodeInfo{ | ||
Codes: []int{403}, | ||
Operator: Match, | ||
WillDo: Skip, | ||
NeedPunishment: false, | ||
} | ||
StatusCodeCheck, err := PageStatusCodeCheck( | ||
p, | ||
[]StatusCodeInfo{statusCode}) | ||
if err != nil { | ||
return err | ||
} | ||
switch StatusCodeCheck { | ||
case Skip, Repeat: | ||
// 跳过后续的逻辑,不需要再次访问 | ||
return errors.New("StatusCodeCheck Error") | ||
} | ||
pageAllXPath := "//*[@id=\"menu\"]/a[2]" | ||
pageLoaded := HasPageLoaded(nowPage, pageAllXPath, 15) | ||
if pageLoaded == false { | ||
return errors.New("HasPageLoaded == false") | ||
} | ||
|
||
uaUrlMap := make(map[string][]string, 0) | ||
uaResultMap := make(map[string][]string, 0) | ||
|
||
err = rod.Try(func() { | ||
// 遍历所有的 A 的链接,然后读取 href 信息,匹配 <a href="/pages/Chrome/ " class="unterMenuName">Chrome</a> | ||
aEls := nowPage.MustElements("a") | ||
for i, el := range aEls { | ||
|
||
elString := el.MustText() | ||
if isSupportUAName(elString) == false { | ||
continue | ||
} | ||
nowElUrlPath := el.MustProperty("href") | ||
logger.Infoln(i, elString, nowElUrlPath) | ||
_, found := uaUrlMap[elString] | ||
if found == false { | ||
uaUrlMap[elString] = make([]string, 0) | ||
} | ||
uaUrlMap[elString] = append(uaUrlMap[elString], nowElUrlPath.String()) | ||
} | ||
}) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
for uaName, uaUrls := range uaUrlMap { | ||
|
||
uaResultMap[uaName] = make([]string, 0) | ||
for index, uaUrl := range uaUrls { | ||
|
||
logger.Infoln(uaName, index, uaUrl) | ||
nowPage, p, err = PageNavigate(nowPage, uaUrl, 15*time.Second) | ||
if err != nil { | ||
return err | ||
} | ||
StatusCodeCheck, err = PageStatusCodeCheck( | ||
p, | ||
[]StatusCodeInfo{statusCode}) | ||
if err != nil { | ||
return err | ||
} | ||
switch StatusCodeCheck { | ||
case Skip, Repeat: | ||
// 跳过后续的逻辑,不需要再次访问 | ||
return errors.New("StatusCodeCheck Error") | ||
} | ||
pageLoaded = HasPageLoaded(nowPage, pageAllXPath, 15) | ||
if pageLoaded == false { | ||
return errors.New("HasPageLoaded == false") | ||
} | ||
err = rod.Try(func() { | ||
// 遍历所有的 ul ,然后再次遍历 ul 中的 A 的链接 | ||
aULs := nowPage.MustElements("ul") | ||
for _, ul := range aULs { | ||
|
||
aEls := ul.MustElements("a") | ||
for _, aEl := range aEls { | ||
|
||
uaString := aEl.MustText() | ||
uaResultMap[uaName] = append(uaResultMap[uaName], uaString) | ||
} | ||
} | ||
}) | ||
if err != nil { | ||
return err | ||
} | ||
} | ||
} | ||
// 当前的目录下缓存下来 | ||
saveRootPath := filepath.Join(".", "cache", "ua") | ||
if IsDir(saveRootPath) == false { | ||
err = os.MkdirAll(saveRootPath, os.ModePerm) | ||
if err != nil { | ||
return err | ||
} | ||
} | ||
// 根据查询到的结果,写入本地的缓存 | ||
for uaName, results := range uaResultMap { | ||
nowInfo := UserAgentInfo{ | ||
UserAgentMainType: Browsers, | ||
SubType: uaName, | ||
UserAgents: results, | ||
} | ||
|
||
desSaveFPath := filepath.Join(saveRootPath, uaName+".json") | ||
logger.Infoln("uaName:", uaName, desSaveFPath) | ||
err = ToFile(desSaveFPath, nowInfo) | ||
if err != nil { | ||
return err | ||
} | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func RandomUserAgent(tmpRootFolder, httpProxyURL string) string { | ||
|
||
var err error | ||
// 是否已经读取过本地的缓存 | ||
if len(allUANames) > 0 { | ||
return allUANames[rand.Intn(len(allUANames))] | ||
} | ||
// 查看本地是否有缓存 | ||
uaRootPath := filepath.Join(".", "cache", "ua") | ||
if IsDir(uaRootPath) == false { | ||
err = GetFakeUserAgentDataCache(tmpRootFolder, httpProxyURL) | ||
if err != nil { | ||
logger.Panicln(err) | ||
} | ||
} | ||
for i, subType := range subTypes { | ||
uaFilePath := filepath.Join(uaRootPath, subType+".json") | ||
if IsFile(uaFilePath) == false { | ||
err = GetFakeUserAgentDataCache(tmpRootFolder, httpProxyURL) | ||
if err != nil { | ||
logger.Panicln(err) | ||
} | ||
} | ||
uaInfo := UserAgentInfo{} | ||
err = ToStruct(uaFilePath, &uaInfo) | ||
if err != nil { | ||
logger.Panicln(err) | ||
} | ||
allUANames = append(allUANames, uaInfo.UserAgents...) | ||
logger.Infoln(i, subType, len(uaInfo.UserAgents)) | ||
} | ||
|
||
// 是否已经读取过本地的缓存 | ||
if len(allUANames) > 0 { | ||
return allUANames[rand.Intn(len(allUANames))] | ||
} else { | ||
logger.Panicln("RandomUserAgent is empty") | ||
} | ||
|
||
return "" | ||
} | ||
|
||
// UserAgentInfo groups the User-Agent strings of one sub type. It is the value
// written to and read back from the per-sub-type cache files.
type UserAgentInfo struct {
	// UserAgentMainType is the top-level category.
	UserAgentMainType UserAgentMainType
	// SubType names the group inside the category; for the browser category
	// this is e.g. "Chrome" or "Firefox".
	SubType string
	// UserAgents holds the User-Agent strings that belong to this sub type.
	UserAgents []string
}

// UserAgentMainType identifies a top-level User-Agent category.
type UserAgentMainType int

// Top-level categories, numbered consecutively starting at 1.
const (
	All UserAgentMainType = 1 + iota
	Crawlers
	Browsers
	MobileBrowsers
	Consoles
	OfflineBrowsers
	EMailClients
	LinkCheckers
	EMailCollectors
	Validators
	FeedReaders
	Libraries
	CloudPlatforms
	// Ohters is a misspelling of "Others"; the exported name is kept as is so
	// existing references keep compiling.
	Ohters
)
|
||
// Names of the browser sub types that are scraped and cached. Each value is
// compared against link text on the index page and used as the cache file's
// base name.
const (
	Chrome  = "Chrome"
	Edge    = "Edge"
	Firefox = "Firefox"
	Opera   = "Opera"
	Safari  = "Safari"
	Mozilla = "Mozilla"
)

// isSupportUAName reports whether inName is exactly (case-sensitively) one of
// the supported browser names.
func isSupportUAName(inName string) bool {
	return inName == Chrome ||
		inName == Edge ||
		inName == Firefox ||
		inName == Opera ||
		inName == Safari ||
		inName == Mozilla
}

// allUANames is the in-memory list of every cached User-Agent string; it stays
// empty until the cache files have been loaded.
var allUANames []string

// subTypes lists the sub types whose cache files are loaded, in load order.
var subTypes = []string{
	Chrome,
	Edge,
	Firefox,
	Opera,
	Safari,
	Mozilla,
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
package rod_helper | ||
|
||
import "testing" | ||
|
||
func TestGetFakeUserAgentDataCache(t *testing.T) { | ||
|
||
err := GetFakeUserAgentDataCache("C:\\Tmp", "http://192.168.50.252:20171") | ||
if err != nil { | ||
t.Fatal(err) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package rod_helper | ||
|
||
import ( | ||
"crypto/tls" | ||
"github.com/go-resty/resty/v2" | ||
"net/http" | ||
"time" | ||
) | ||
|
||
// NewHttpClient 新建一个 resty 的对象 | ||
func NewHttpClient(opt *HttpClientOptions) (*resty.Client, error) { | ||
//const defUserAgent = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50" | ||
//const defUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41" | ||
|
||
var UserAgent string | ||
// ------------------------------------------------ | ||
// 随机的 Browser | ||
UserAgent = RandomUserAgent(opt.TmpRootFolder, opt.HttpProxyUrl) | ||
// ------------------------------------------------ | ||
httpClient := resty.New().SetTransport(&http.Transport{ | ||
DisableKeepAlives: true, | ||
MaxIdleConns: 1000, | ||
MaxIdleConnsPerHost: 1000, | ||
}) | ||
httpClient.SetTimeout(opt.HTMLTimeOut) | ||
httpClient.SetRetryCount(1) | ||
// ------------------------------------------------ | ||
// 设置 Referer | ||
if len(opt.Referer) > 0 { | ||
httpClient.SetHeader("Referer", opt.Referer) | ||
} | ||
// ------------------------------------------------ | ||
// 设置 Header | ||
httpClient.SetHeaders(map[string]string{ | ||
"Content-Type": "application/json", | ||
"User-Agent": UserAgent, | ||
}) | ||
// ------------------------------------------------ | ||
// 不要求安全链接 | ||
httpClient.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true}) | ||
// ------------------------------------------------ | ||
// http 代理 | ||
if opt.HttpProxyUrl != "" { | ||
httpClient.SetProxy(opt.HttpProxyUrl) | ||
} else { | ||
httpClient.RemoveProxy() | ||
} | ||
|
||
return httpClient, nil | ||
} | ||
|
||
// HttpClientOptions carries the settings consumed by NewHttpClient.
type HttpClientOptions struct {
	// TmpRootFolder is passed to RandomUserAgent together with HttpProxyUrl.
	TmpRootFolder string
	// HTMLTimeOut is the request timeout applied to the client.
	HTMLTimeOut time.Duration
	// HttpProxyUrl is the HTTP proxy to use; empty means no proxy.
	HttpProxyUrl string
	// Referer is sent as the Referer header when non-empty.
	Referer string
}

// NewHttpClientOptions returns a new HttpClientOptions whose fields are set
// from the arguments of the same name.
func NewHttpClientOptions(tmpRootFolder string, HTMLTimeOut time.Duration, httpProxyUrl string, referer string) *HttpClientOptions {
	opt := new(HttpClientOptions)
	opt.TmpRootFolder = tmpRootFolder
	opt.HTMLTimeOut = HTMLTimeOut
	opt.HttpProxyUrl = httpProxyUrl
	opt.Referer = referer
	return opt
}
Oops, something went wrong.