Skip to content

Commit

Permalink
重写随机 UA 获取逻辑
Browse files Browse the repository at this point in the history
Signed-off-by: allan716 <[email protected]>
  • Loading branch information
allanpk716 committed Dec 3, 2022
1 parent 7a7616f commit dd35373
Show file tree
Hide file tree
Showing 9 changed files with 365 additions and 116 deletions.
2 changes: 1 addition & 1 deletion adblock_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import "testing"
func TestGetADBlock(t *testing.T) {

httpProxyUrl := "http://192.168.50.233:10807"
_, err := GetADBlock(httpProxyUrl)
_, err := GetADBlock("C:\\Tmp", httpProxyUrl)
if err != nil {
t.Fatal(err)
}
Expand Down
274 changes: 274 additions & 0 deletions fake_user_agent.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
package rod_helper

import (
"github.com/WQGroup/logger"
"github.com/go-rod/rod"
"github.com/go-rod/rod/lib/proto"
"github.com/pkg/errors"
"math/rand"
"os"
"path/filepath"
"time"
)

// GetFakeUserAgentDataCache creates a browser and a page and hands the page
// to parseUAAllPage, which collects the User-Agent lists and writes them to
// the local cache folder.
//
// For now only the "Browsers" category is collected, narrowed down to these
// desktop browsers: Chrome, Edge, Firefox, Opera, Safari and Mozilla.
//
// tmpRootFolder and httpProxyURL are forwarded unchanged to NewBrowserBase.
// The browser and the page are closed when the function returns. Errors from
// creating the browser, creating the page or from parseUAAllPage are returned
// as they are.
func GetFakeUserAgentDataCache(tmpRootFolder, httpProxyURL string) error {
	browser, err := NewBrowserBase(tmpRootFolder, "", httpProxyURL, true, false)
	if err != nil {
		return err
	}
	defer browser.Close()

	page, err := NewPage(browser.Browser)
	if err != nil {
		return err
	}
	defer func() {
		// Best effort: an error while closing the page is deliberately ignored.
		_ = page.Close()
	}()

	// The parser walks every <a> element and keeps entries such as
	// <a href="/pages/Chrome/ " class="unterMenuName">Chrome</a>.
	return parseUAAllPage(page)
}

// parseUAAllPage scrapes useragentstring.com with the given page and caches
// the result on disk.
//
// Steps:
//  1. Open the index page that lists every User-Agent sub type.
//  2. Collect the href of every <a> whose text is a supported browser name
//     (see isSupportUAName), grouped by that name.
//  3. Visit each collected link and gather the text of every <a> found inside
//     a <ul>; these texts are stored as the User-Agent strings.
//  4. Write one "<name>.json" file per browser name into ./cache/ua through
//     ToFile, creating the folder first when IsDir reports it missing.
//
// Every page visit must pass the status-code check (a 403 rule with WillDo
// set to Skip) and the HasPageLoaded check; otherwise an error is returned
// and nothing is written. The first error encountered aborts the whole run.
func parseUAAllPage(nowPage *rod.Page) error {
	const (
		// Index page that links to every User-Agent sub type.
		allInfoPage = "https://useragentstring.com/pages/useragentstring.php"
		// Element waited for on every visited page.
		pageAllXPath = "//*[@id=\"menu\"]/a[2]"
	)

	forbidden := StatusCodeInfo{
		Codes:          []int{403},
		Operator:       Match,
		WillDo:         Skip,
		NeedPunishment: false,
	}

	// visit navigates nowPage to url and validates the response. nowPage is
	// replaced by whatever page PageNavigate hands back.
	visit := func(url string) error {
		var (
			resp *proto.NetworkResponseReceived
			err  error
		)
		nowPage, resp, err = PageNavigate(nowPage, url, 15*time.Second)
		if err != nil {
			return err
		}
		verdict, err := PageStatusCodeCheck(resp, []StatusCodeInfo{forbidden})
		if err != nil {
			return err
		}
		if verdict == Skip || verdict == Repeat {
			// Stop here; the page is not going to be requested again.
			return errors.New("StatusCodeCheck Error")
		}
		if !HasPageLoaded(nowPage, pageAllXPath, 15) {
			return errors.New("HasPageLoaded == false")
		}
		return nil
	}

	if err := visit(allInfoPage); err != nil {
		return err
	}

	// Browser name -> links of the pages listing its User-Agent strings.
	linksByName := make(map[string][]string)
	err := rod.Try(func() {
		// Look at every link and keep those whose text is a supported name,
		// e.g. <a href="/pages/Chrome/ " class="unterMenuName">Chrome</a>.
		for idx, anchor := range nowPage.MustElements("a") {
			label := anchor.MustText()
			if !isSupportUAName(label) {
				continue
			}
			href := anchor.MustProperty("href")
			logger.Infoln(idx, label, href)
			linksByName[label] = append(linksByName[label], href.String())
		}
	})
	if err != nil {
		return err
	}

	// Browser name -> collected User-Agent strings.
	agentsByName := make(map[string][]string)
	for name, links := range linksByName {
		// Register the name up front so a file is written even when no
		// User-Agent string is found for it.
		agentsByName[name] = make([]string, 0)
		for pos, link := range links {
			logger.Infoln(name, pos, link)
			if err = visit(link); err != nil {
				return err
			}
			err = rod.Try(func() {
				// Walk every <ul>, then every link inside it.
				for _, list := range nowPage.MustElements("ul") {
					for _, anchor := range list.MustElements("a") {
						agentsByName[name] = append(agentsByName[name], anchor.MustText())
					}
				}
			})
			if err != nil {
				return err
			}
		}
	}

	// The cache lives below the current working directory.
	saveRootPath := filepath.Join(".", "cache", "ua")
	if !IsDir(saveRootPath) {
		if err = os.MkdirAll(saveRootPath, os.ModePerm); err != nil {
			return err
		}
	}

	// Persist what was collected, one file per browser name.
	for name, agents := range agentsByName {
		target := filepath.Join(saveRootPath, name+".json")
		logger.Infoln("uaName:", name, target)
		err = ToFile(target, UserAgentInfo{
			UserAgentMainType: Browsers,
			SubType:           name,
			UserAgents:        agents,
		})
		if err != nil {
			return err
		}
	}

	return nil
}

// RandomUserAgent returns one User-Agent string chosen at random from the
// cached lists of the browsers named in subTypes.
//
// After the first successful load the strings are kept in the package-level
// allUANames slice, so later calls only pick from that slice. On the first
// call the "<subType>.json" files under ./cache/ua are read with ToStruct.
// When the folder or one of the files is missing, GetFakeUserAgentDataCache
// is called with tmpRootFolder and httpProxyURL to rebuild the cache; this
// happens at most once per call, because a second scrape cannot produce a
// file the first one did not.
//
// The strings are accumulated locally and published to allUANames only after
// every sub type has been read, so a failure half-way through never leaves a
// partially filled cache behind for later calls.
//
// Every failure is reported through logger.Panicln. The empty string is
// returned only if logger.Panicln returns instead of panicking.
//
// NOTE(review): allUANames is read and written without synchronization;
// confirm that callers do not invoke this function concurrently.
func RandomUserAgent(tmpRootFolder, httpProxyURL string) string {
	// Fast path: the cache has already been loaded into memory.
	if len(allUANames) > 0 {
		return allUANames[rand.Intn(len(allUANames))]
	}

	uaRootPath := filepath.Join(".", "cache", "ua")

	refreshed := false
	refresh := func() {
		if err := GetFakeUserAgentDataCache(tmpRootFolder, httpProxyURL); err != nil {
			logger.Panicln(err)
		}
		refreshed = true
	}

	// No cache folder at all: build the cache first.
	if !IsDir(uaRootPath) {
		refresh()
	}

	var loaded []string
	for i, subType := range subTypes {
		uaFilePath := filepath.Join(uaRootPath, subType+".json")
		// A missing file triggers a rebuild, unless one already ran during
		// this call; in that case ToStruct below reports the missing file.
		if !refreshed && !IsFile(uaFilePath) {
			refresh()
		}
		uaInfo := UserAgentInfo{}
		if err := ToStruct(uaFilePath, &uaInfo); err != nil {
			logger.Panicln(err)
		}
		loaded = append(loaded, uaInfo.UserAgents...)
		logger.Infoln(i, subType, len(uaInfo.UserAgents))
	}

	if len(loaded) == 0 {
		logger.Panicln("RandomUserAgent is empty")
		return ""
	}

	// Publish only a complete result.
	allUANames = loaded
	return allUANames[rand.Intn(len(allUANames))]
}

// UserAgentInfo is the record kept per sub type: parseUAAllPage builds one and
// passes it to ToFile, and RandomUserAgent fills one through ToStruct.
type UserAgentInfo struct {
UserAgentMainType UserAgentMainType // Main category.
SubType string // Sub type inside the main category, e.g. Chrome or Firefox for browsers.
UserAgents []string // User-Agent strings that belong to this sub type.
}

// UserAgentMainType identifies the main category of a User-Agent.
type UserAgentMainType int

// Main categories. Values start at 1 (iota + 1) and follow declaration order,
// so reordering the list changes the numeric value of every later entry.
const (
All UserAgentMainType = iota + 1
Crawlers
Browsers
MobileBrowsers
Consoles
OfflineBrowsers
EMailClients
LinkCheckers
EMailCollectors
Validators
FeedReaders
Libraries
CloudPlatforms
// NOTE(review): looks like a misspelling of "Others"; the identifier is
// exported, so renaming it would change the public API.
Ohters
)

// Browser names accepted by isSupportUAName.
const (
	Chrome  = "Chrome"
	Edge    = "Edge"
	Firefox = "Firefox"
	Opera   = "Opera"
	Safari  = "Safari"
	Mozilla = "Mozilla"
)

// isSupportUAName reports whether inName is exactly one of the supported
// browser names. The comparison is case-sensitive and does not trim spaces.
func isSupportUAName(inName string) bool {
	for _, name := range [...]string{Chrome, Edge, Firefox, Opera, Safari, Mozilla} {
		if inName == name {
			return true
		}
	}
	return false
}

var (
// allUANames is filled by RandomUserAgent with the UserAgents entries read
// from the cache files; despite its name it holds User-Agent strings, not
// browser names.
allUANames []string
// subTypes lists the sub types whose "<subType>.json" cache files
// RandomUserAgent reads.
subTypes = []string{
Chrome,
Edge,
Firefox,
Opera,
Safari,
Mozilla,
}
)
11 changes: 11 additions & 0 deletions fake_user_agent_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package rod_helper

import "testing"

// TestGetFakeUserAgentDataCache runs GetFakeUserAgentDataCache with a
// hard-coded temp folder and HTTP proxy address and fails on any error.
// NOTE(review): presumably needs that proxy and the remote site to be
// reachable; confirm before running it unattended.
func TestGetFakeUserAgentDataCache(t *testing.T) {
	const (
		tmpRootFolder = "C:\\Tmp"
		httpProxyURL  = "http://192.168.50.252:20171"
	)
	if err := GetFakeUserAgentDataCache(tmpRootFolder, httpProxyURL); err != nil {
		t.Fatal(err)
	}
}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ require (
github.com/golang/mock v1.6.0 // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/google/go-cmp v0.5.8 // indirect
github.com/google/go-cmp v0.5.9 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/jonboulle/clockwork v0.3.0 // indirect
Expand Down
2 changes: 1 addition & 1 deletion go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs=
github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
Expand Down
66 changes: 66 additions & 0 deletions http_client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package rod_helper

import (
"crypto/tls"
"github.com/go-resty/resty/v2"
"net/http"
"time"
)

// NewHttpClient creates a resty client configured from opt.
//
// The client gets:
//   - a dedicated http.Transport with keep-alives disabled;
//   - opt.HTMLTimeOut as request timeout and a retry count of 1;
//   - a "Referer" header when opt.Referer is not empty;
//   - "Content-Type: application/json" and a random "User-Agent" obtained
//     from RandomUserAgent;
//   - TLS certificate verification switched off (InsecureSkipVerify);
//   - opt.HttpProxyUrl as proxy, or no proxy when it is empty.
//
// opt must not be nil. The returned error is always nil in the current
// implementation.
func NewHttpClient(opt *HttpClientOptions) (*resty.Client, error) {
	// Fixed User-Agents used previously, kept for reference:
	//const defUserAgent = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
	//const defUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41"

	// Random browser User-Agent.
	userAgent := RandomUserAgent(opt.TmpRootFolder, opt.HttpProxyUrl)

	transport := &http.Transport{
		DisableKeepAlives:   true,
		MaxIdleConns:        1000,
		MaxIdleConnsPerHost: 1000,
	}
	client := resty.New().
		SetTransport(transport).
		SetTimeout(opt.HTMLTimeOut).
		SetRetryCount(1)

	// Referer is optional.
	if opt.Referer != "" {
		client.SetHeader("Referer", opt.Referer)
	}

	// Default headers.
	client.SetHeaders(map[string]string{
		"Content-Type": "application/json",
		"User-Agent":   userAgent,
	})

	// Secure connections are not required: certificates are not verified.
	client.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true})

	// HTTP proxy.
	if opt.HttpProxyUrl == "" {
		client.RemoveProxy()
	} else {
		client.SetProxy(opt.HttpProxyUrl)
	}

	return client, nil
}

// HttpClientOptions carries the settings read by NewHttpClient.
type HttpClientOptions struct {
	TmpRootFolder string        // Forwarded to RandomUserAgent as tmpRootFolder.
	HTMLTimeOut   time.Duration // Request timeout applied to the client.
	HttpProxyUrl  string        // Proxy URL; when empty the client uses no proxy.
	Referer       string        // Referer header value; not set when empty.
}

// NewHttpClientOptions returns a pointer to a new HttpClientOptions holding
// exactly the given values; no validation or defaulting is performed.
func NewHttpClientOptions(tmpRootFolder string, HTMLTimeOut time.Duration, httpProxyUrl string, referer string) *HttpClientOptions {
	opt := new(HttpClientOptions)
	opt.TmpRootFolder = tmpRootFolder
	opt.HTMLTimeOut = HTMLTimeOut
	opt.HttpProxyUrl = httpProxyUrl
	opt.Referer = referer
	return opt
}
Loading

0 comments on commit dd35373

Please sign in to comment.