Skip to content

Commit

Permalink
upgrade version
Browse files Browse the repository at this point in the history
  • Loading branch information
xuxueli committed Oct 15, 2022
1 parent 327d24e commit ffe8174
Show file tree
Hide file tree
Showing 10 changed files with 59 additions and 30 deletions.
10 changes: 5 additions & 5 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
<selenium-java.version>4.5.0</selenium-java.version>
<phantomjsdriver.version>1.5.0</phantomjsdriver.version>

<slf4j-api.version>1.7.25</slf4j-api.version>
<junit.version>4.11</junit.version>
<slf4j-api.version>2.0.3</slf4j-api.version>
<junit-jupiter.version>5.8.2</junit-jupiter.version>
</properties>

<dependencies>
Expand Down Expand Up @@ -78,9 +78,9 @@

<!-- junit -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<version>${junit-jupiter.version}</version>
<scope>test</scope>
</dependency>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ public Document load(PageRequest pageRequest) {
if (pageRequest.getProxy() != null) {
InetSocketAddress address = (InetSocketAddress) pageRequest.getProxy().address();
boolean isSocks = pageRequest.getProxy().type() == Proxy.Type.SOCKS;
webClient.getOptions().setProxyConfig(new ProxyConfig(address.getHostName(), address.getPort(), isSocks));
String proxyScheme = null;
webClient.getOptions().setProxyConfig(new ProxyConfig(address.getHostName(), address.getPort(), proxyScheme, isSocks));
}

// 发出请求
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.Duration;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
* "selenium + phantomjs" page loader
Expand All @@ -40,17 +40,15 @@ public Document load(PageRequest pageRequest) {

// driver init
DesiredCapabilities dcaps = new DesiredCapabilities();
dcaps.setCapability(CapabilityType.ACCEPT_SSL_CERTS, !pageRequest.isValidateTLSCertificates());
dcaps.setCapability(CapabilityType.TAKES_SCREENSHOT, false);
dcaps.setCapability(CapabilityType.SUPPORTS_FINDING_BY_CSS, true);
dcaps.setJavascriptEnabled(true);
dcaps.setCapability(CapabilityType.ACCEPT_INSECURE_CERTS, !pageRequest.isValidateTLSCertificates());
//dcaps.setCapability(CapabilityType.TAKES_SCREENSHOT, false); // Deprecated
if (driverPath!=null && driverPath.trim().length()>0) {
dcaps.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, driverPath);
}

if (pageRequest.getProxy() != null) {
dcaps.setCapability(CapabilityType.ForSeleniumServer.AVOIDING_PROXY, true);
dcaps.setCapability(CapabilityType.ForSeleniumServer.ONLY_PROXYING_SELENIUM_TRAFFIC, true);
/*dcaps.setCapability(CapabilityType.ForSeleniumServer.AVOIDING_PROXY, true); // Deprecated
dcaps.setCapability(CapabilityType.ForSeleniumServer.ONLY_PROXYING_SELENIUM_TRAFFIC, true);*/
System.setProperty("http.nonProxyHosts", "localhost");
dcaps.setCapability(CapabilityType.PROXY, pageRequest.getProxy());
}
Expand All @@ -71,9 +69,9 @@ public Document load(PageRequest pageRequest) {
}
}

webDriver.manage().timeouts().implicitlyWait(pageRequest.getTimeoutMillis(), TimeUnit.MILLISECONDS);
webDriver.manage().timeouts().pageLoadTimeout(pageRequest.getTimeoutMillis(), TimeUnit.MILLISECONDS);
webDriver.manage().timeouts().setScriptTimeout(pageRequest.getTimeoutMillis(), TimeUnit.MILLISECONDS);
webDriver.manage().timeouts().implicitlyWait(Duration.ofMillis(pageRequest.getTimeoutMillis()));
webDriver.manage().timeouts().pageLoadTimeout(Duration.ofMillis(pageRequest.getTimeoutMillis()));
webDriver.manage().timeouts().setScriptTimeout(Duration.ofMillis(pageRequest.getTimeoutMillis()));

String pageSource = webDriver.getPageSource();
if (pageSource != null) {
Expand Down
34 changes: 32 additions & 2 deletions src/main/java/com/xuxueli/crawler/util/JsoupUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,14 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;
import java.util.HashSet;
import java.util.Set;

Expand Down Expand Up @@ -52,7 +59,9 @@ public static Document load(PageRequest pageRequest) {
conn.referrer(pageRequest.getReferrer());
}
conn.timeout(pageRequest.getTimeoutMillis());
conn.validateTLSCertificates(pageRequest.isValidateTLSCertificates());
if (pageRequest.isValidateTLSCertificates()) {
conn.sslSocketFactory(generateSSLSocketFactory());
}
conn.maxBodySize(0); // 取消默认1M限制

// 代理
Expand All @@ -73,6 +82,25 @@ public static Document load(PageRequest pageRequest) {
return null;
}
}
/**
 * Builds an {@link SSLSocketFactory} whose trust manager accepts every certificate chain.
 *
 * <p><b>Security:</b> both {@code checkClientTrusted} and {@code checkServerTrusted} are
 * no-ops, so connections made through the returned factory perform no certificate
 * verification at all. It should therefore only be installed when the caller has
 * explicitly asked to skip TLS certificate validation.
 *
 * <p>NOTE(review): the call sites visible in this change install this factory when
 * {@code pageRequest.isValidateTLSCertificates()} is {@code true}; that condition looks
 * inverted relative to the previous {@code conn.validateTLSCertificates(...)} call —
 * confirm and negate it if so.
 *
 * @return a socket factory that trusts any client or server certificate
 * @throws RuntimeException if the TLS context cannot be obtained or initialised
 */
private static SSLSocketFactory generateSSLSocketFactory() {
    TrustManager[] trustAllCerts = new TrustManager[] {
        new X509TrustManager() {
            @Override
            public X509Certificate[] getAcceptedIssuers() {
                return new X509Certificate[0];
            }

            @Override
            public void checkClientTrusted(X509Certificate[] certs, String authType) {
                // Intentionally empty: every client certificate is accepted.
            }

            @Override
            public void checkServerTrusted(X509Certificate[] certs, String authType) {
                // Intentionally empty: every server certificate is accepted.
            }
        }
    };
    try {
        // Request the context by its standard name "TLS" rather than the legacy "SSL" name.
        SSLContext sslContext = SSLContext.getInstance("TLS");
        sslContext.init(null, trustAllCerts, new java.security.SecureRandom());
        return sslContext.getSocketFactory();
    } catch (KeyManagementException | NoSuchAlgorithmException e) {
        throw new RuntimeException("Failed to create a SSL socket factory", e);
    }
}


public static String loadPageSource(PageRequest pageRequest) {
Expand All @@ -98,7 +126,9 @@ public static String loadPageSource(PageRequest pageRequest) {
conn.referrer(pageRequest.getReferrer());
}
conn.timeout(pageRequest.getTimeoutMillis());
conn.validateTLSCertificates(pageRequest.isValidateTLSCertificates());
if (pageRequest.isValidateTLSCertificates()) {
conn.sslSocketFactory(generateSSLSocketFactory());
}
conn.maxBodySize(0); // 取消默认1M限制

// 代理
Expand Down
4 changes: 2 additions & 2 deletions src/test/java/com/xuxueli/crawler/test/XxlCrawlerTest04.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ public static void main(String[] args) {

// 设置代理池
ProxyMaker proxyMaker = new RoundProxyMaker()
.addProxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("---", 80)));
.addProxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("{自定义代理信息}", 80)));

// 构造爬虫 (代理方式请求IP地址查询网IP138,可从页面响应确认代理是否生效)
XxlCrawler crawler = new XxlCrawler.Builder()
.setUrls("http://2018.ip138.com/ic.asp")
.setUrls("http://pv.sohu.com/cityjson")
.setAllowSpread(false)
.setProxyMaker(proxyMaker)
.setPageParser(new PageParser<Object>() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ public void parse(Document html, Element pageVoElement, PageVo pageVo) {
if (proxyPool!=null && proxyPool.size()>0) {
for (PageVo pageVo: proxyPool) {
try {
Document html = JsoupUtil.load(new PageRequest("http://2018.ip138.com/ic.asp",
Document html = JsoupUtil.load(new PageRequest("http://pv.sohu.com/cityjson",
null,
null,
null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import com.xuxueli.crawler.conf.XxlCrawlerConf;
import com.xuxueli.crawler.util.FileUtil;
import org.junit.Test;
import org.junit.jupiter.api.Test;

/**
* page downloader test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import com.xuxueli.crawler.model.PageRequest;
import com.xuxueli.crawler.util.JsoupUtil;
import org.jsoup.nodes.Document;
import org.junit.Test;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package com.xuxueli.crawler.test.util;

import com.xuxueli.crawler.util.RegexUtil;
import org.junit.Assert;
import org.junit.Test;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/**
* regex tool test
Expand All @@ -21,7 +21,7 @@ public void matchesTest(){
String url = "https://my.oschina.net/xuxueli/blog/690978";

boolean ret = RegexUtil.matches(regex, url);
Assert.assertTrue(ret);
Assertions.assertTrue(ret);
}

/**
Expand All @@ -32,7 +32,7 @@ public void isUrlTest(){
String url = "http://www.baidu.com/";

boolean ret = RegexUtil.isUrl(url);
Assert.assertTrue(ret);
Assertions.assertTrue(ret);
}

}
6 changes: 3 additions & 3 deletions src/test/java/com/xuxueli/crawler/test/util/UrlUtilTest.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package com.xuxueli.crawler.test.util;

import com.xuxueli.crawler.util.UrlUtil;
import org.junit.Assert;
import org.junit.Test;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/**
* url tool test
Expand All @@ -19,7 +19,7 @@ public void isUrlTest(){
String url = "http://www.baidu.com/";

boolean ret = UrlUtil.isUrl(url);
Assert.assertTrue(ret);
Assertions.assertTrue(ret);
}

}

0 comments on commit ffe8174

Please sign in to comment.