diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index c11df693c..15206b920 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -49,7 +49,7 @@ public class Page { private byte[] bytes; - private List targetRequests = new ArrayList(); + private List targetRequests = new ArrayList<>(); private String charset; @@ -108,6 +108,7 @@ public Json getJson() { * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. */ + @Deprecated public void setHtml(Html html) { this.html = html; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 9fc286192..b73665ab2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -40,9 +40,9 @@ public class Request implements Serializable { /** * cookies for current url, if not set use Site's cookies */ - private Map cookies = new HashMap(); + private Map cookies = new HashMap<>(); - private Map headers = new HashMap(); + private Map headers = new HashMap<>(); /** * Priority of the request.
@@ -94,7 +94,7 @@ public T getExtra(String key) { public Request putExtra(String key, T value) { if (extras == null) { - extras = new HashMap(); + extras = new HashMap<>(); } extras.put(key, value); return this; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index 488c81e77..273b0a30e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -14,7 +14,7 @@ */ public class ResultItems { - private Map fields = new LinkedHashMap(); + private Map fields = new LinkedHashMap<>(); private Request request; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 4879b2825..9cbda0222 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -22,9 +22,9 @@ public class Site { private String userAgent; - private Map defaultCookies = new LinkedHashMap(); + private Map defaultCookies = new LinkedHashMap<>(); - private Map> cookies = new HashMap>(); + private Map> cookies = new HashMap<>(); private String charset; @@ -38,11 +38,11 @@ public class Site { private int timeOut = 5000; - private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); + private static final Set DEFAULT_STATUS_CODE_SET = new HashSet<>(); private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; - private Map headers = new HashMap(); + private Map headers = new HashMap<>(); private boolean useGzip = true; @@ -83,7 +83,7 @@ public Site addCookie(String name, String value) { */ public Site addCookie(String domain, String name, String value) { if (!cookies.containsKey(domain)){ - cookies.put(domain,new HashMap()); + cookies.put(domain,new HashMap<>()); } cookies.get(domain).put(name, value); return this; diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 5940e738d..925548147 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -62,7 +62,7 @@ public class Spider implements Runnable, Task { protected Downloader downloader; - protected List pipelines = new ArrayList(); + protected List pipelines = new ArrayList<>(); protected PageProcessor pageProcessor; @@ -86,11 +86,11 @@ public class Spider implements Runnable, Task { protected boolean exitWhenComplete = true; - protected final static int STAT_INIT = 0; + protected static final int STAT_INIT = 0; - protected final static int STAT_RUNNING = 1; + protected static final int STAT_RUNNING = 1; - protected final static int STAT_STOPPED = 2; + protected static final int STAT_STOPPED = 2; protected boolean spawnUrl = true; @@ -173,6 +173,7 @@ public Spider setUUID(String uuid) { * @param scheduler scheduler * @return this * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) + * @deprecated use {@link #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)} instead */ @Deprecated public Spider scheduler(Scheduler scheduler) { @@ -247,7 +248,7 @@ public Spider setPipelines(List pipelines) { * @return this */ public Spider clearPipeline() { - pipelines = new ArrayList(); + pipelines = new ArrayList<>(); return this; } @@ -438,7 +439,6 @@ private void onDownloadSuccess(Request request, Page page) { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); - return; } private void onDownloaderFail(Request request) { @@ -544,7 +544,7 @@ protected CollectorPipeline getCollectorPipeline() { public T get(String url) { List urls = WMCollections.newArrayList(url); List resultItemses = getAll(urls); - if (resultItemses != null && resultItemses.size() > 0) { + if (resultItemses != null && !(resultItemses.isEmpty())) { return resultItemses.get(0); } else { 
return null; @@ -677,7 +677,7 @@ public Status getStatus() { public enum Status { - Init(0), Running(1), Stopped(2); + INIT(0), RUNNING(1), STOPPED(2); private Status(int value) { this.value = value; @@ -696,7 +696,7 @@ public static Status fromValue(int value) { } } //default value - return Init; + return INIT; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d09..a71a7d876 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -32,7 +32,7 @@ public Html download(String url) { */ public Html download(String url, String charset) { Page page = download(new Request(url), Site.me().setCharset(charset).toTask()); - return (Html) page.getHtml(); + return page.getHtml(); } protected void onSuccess(Request request) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 49217e111..b2a39e910 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -33,7 +33,7 @@ public class HttpClientDownloader extends AbstractDownloader { private Logger logger = LoggerFactory.getLogger(getClass()); - private final Map httpClients = new HashMap(); + private final Map httpClients = new HashMap<>(); private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java index 7d3b30785..afd6f88b0 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -16,6 +16,8 @@ */ public class HttpRequestBody implements Serializable { + private static final String ILL_ENC = "illegal encoding "; + private static final long serialVersionUID = 5659170945717023595L; public static abstract class ContentType { @@ -68,7 +70,7 @@ public static HttpRequestBody json(String json, String encoding) { try { return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); } catch (UnsupportedEncodingException e) { - throw new IllegalArgumentException("illegal encoding " + encoding, e); + throw new IllegalArgumentException(ILL_ENC + encoding, e); } } @@ -76,7 +78,7 @@ public static HttpRequestBody xml(String xml, String encoding) { try { return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding); } catch (UnsupportedEncodingException e) { - throw new IllegalArgumentException("illegal encoding " + encoding, e); + throw new IllegalArgumentException(ILL_ENC + encoding, e); } } @@ -92,7 +94,7 @@ public static HttpRequestBody form(Map params, String encoding){ try { return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding); } catch (UnsupportedEncodingException e) { - throw new IllegalArgumentException("illegal encoding " + encoding, e); + throw new IllegalArgumentException(ILL_ENC + encoding, e); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java index ccf00a466..85852bcae 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java @@ -19,6 +19,7 @@ */ public abstract class CharsetUtils { + private static final String CHR = "charset"; private static Logger logger 
= LoggerFactory.getLogger(CharsetUtils.class); public static String detectCharset(String contentType, byte[] contentBytes) throws IOException { @@ -40,9 +41,9 @@ public static String detectCharset(String contentType, byte[] contentBytes) thro for (Element link : links) { // 2.1、html4.01 String metaContent = link.attr("content"); - String metaCharset = link.attr("charset"); - if (metaContent.indexOf("charset") != -1) { - metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); + String metaCharset = link.attr(CHR); + if (metaContent.indexOf(CHR) != -1) { + metaContent = metaContent.substring(metaContent.indexOf(CHR), metaContent.length()); charset = metaContent.split("=")[1]; break; } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java index 3f2de70c5..e2759e563 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java @@ -19,6 +19,7 @@ */ public class ZipCodePageProcessor implements PageProcessor { + private Site site = Site.me().setCharset("gb2312") .setSleepTime(100); diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index e1d9dd039..4de3e01ec 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -63,6 +63,7 @@ class WebDriverPool { * @throws IOException */ public void configure() throws IOException { + // Read config file sConfig = new Properties(); String configFile = DEFAULT_CONFIG_FILE;