Skip to content

Commit 71375cc

Browse files
committed
Minor: Adjusts code fragments to not rely on platform defaults
1 parent 87906c6 commit 71375cc

File tree

20 files changed

+54
-38
lines changed

20 files changed

+54
-38
lines changed

crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/util/Util.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
*/
2020
package edu.uci.ics.crawler4j.util;
2121

22+
import java.util.Locale;
23+
2224
/**
2325
* @author Yasser Ganjisaffar
2426
*/
@@ -69,20 +71,20 @@ public static long byteArray2Long(byte[] b) {
6971
}
7072

7173
public static boolean hasBinaryContent(String contentType) {
72-
String typeStr = (contentType != null) ? contentType.toLowerCase() : "";
74+
String typeStr = (contentType != null) ? contentType.toLowerCase(Locale.ROOT) : "";
7375

7476
return typeStr.contains("image") || typeStr.contains("audio") ||
7577
typeStr.contains("video") || typeStr.contains("application");
7678
}
7779

7880
public static boolean hasPlainTextContent(String contentType) {
79-
String typeStr = (contentType != null) ? contentType.toLowerCase() : "";
81+
String typeStr = (contentType != null) ? contentType.toLowerCase(Locale.ROOT) : "";
8082

8183
return typeStr.contains("text") && !typeStr.contains("html");
8284
}
8385

8486
public static boolean hasCssTextContent(String contentType) {
85-
String typeStr = (contentType != null) ? contentType.toLowerCase() : "";
87+
String typeStr = (contentType != null) ? contentType.toLowerCase(Locale.ROOT) : "";
8688

8789
return typeStr.contains("css");
8890
}

crawler4j-core/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import java.io.InputStream;
2424
import java.nio.charset.Charset;
2525
import java.nio.charset.StandardCharsets;
26+
import java.util.Locale;
2627

2728
import org.apache.hc.core5.http.ContentType;
2829
import org.apache.hc.core5.http.Header;
@@ -180,7 +181,7 @@ public void load(HttpEntity entity, int maxBytes) throws IOException {
180181
}
181182

182183
if (charset != null) {
183-
contentCharset = charset.displayName();
184+
contentCharset = charset.displayName(Locale.ROOT);
184185
}
185186

186187
contentData = toByteArray(entity, maxBytes);

crawler4j-core/src/main/java/edu/uci/ics/crawler4j/parser/AllTagMapper.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121

2222
import org.apache.tika.parser.html.HtmlMapper;
2323

24+
import java.util.Locale;
25+
2426
/**
2527
* Maps all HTML tags (does not ignore any of them)
2628
*
@@ -30,7 +32,7 @@ public class AllTagMapper implements HtmlMapper {
3032

3133
@Override
3234
public String mapSafeElement(String name) {
33-
return name.toLowerCase();
35+
return name.toLowerCase(Locale.ROOT);
3436
}
3537

3638
@Override
@@ -40,6 +42,6 @@ public boolean isDiscardElement(String name) {
4042

4143
@Override
4244
public String mapSafeAttribute(String elementName, String attributeName) {
43-
return attributeName.toLowerCase();
45+
return attributeName.toLowerCase(Locale.ROOT);
4446
}
4547
}

crawler4j-core/src/main/java/edu/uci/ics/crawler4j/parser/BinaryParseData.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import java.io.InputStream;
2626
import java.io.OutputStream;
2727
import java.io.PrintStream;
28+
import java.nio.charset.StandardCharsets;
2829
import java.util.HashSet;
2930
import java.util.Set;
3031

@@ -102,7 +103,7 @@ private static TransformerHandler getTransformerHandler(OutputStream out, String
102103
transformer.setOutputProperty(OutputKeys.ENCODING, encoding);
103104
}
104105

105-
transformerHandler.setResult(new StreamResult(new PrintStream(out)));
106+
transformerHandler.setResult(new StreamResult(new PrintStream(out, false, StandardCharsets.UTF_8)));
106107
return transformerHandler;
107108
}
108109

crawler4j-core/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import java.util.ArrayList;
2323
import java.util.HashMap;
2424
import java.util.List;
25+
import java.util.Locale;
2526
import java.util.Map;
2627

2728
import org.xml.sax.Attributes;
@@ -52,7 +53,7 @@ private static class HtmlFactory {
5253
static {
5354
name2Element = new HashMap<>();
5455
for (Element element : Element.values()) {
55-
name2Element.put(element.toString().toLowerCase(), element);
56+
name2Element.put(element.toString().toLowerCase(Locale.ROOT), element);
5657
}
5758
}
5859

@@ -118,12 +119,12 @@ public void startElement(String uri, String localName, String qName, Attributes
118119

119120
String content = attributes.getValue("content");
120121
if ((equiv != null) && (content != null)) {
121-
equiv = equiv.toLowerCase();
122+
equiv = equiv.toLowerCase(Locale.ROOT);
122123
metaTags.put(equiv, content);
123124

124125
// http-equiv="refresh" content="0;URL=http://foo.bar/..."
125126
if ("refresh".equals(equiv) && (metaRefresh == null)) {
126-
int pos = content.toLowerCase().indexOf("url=");
127+
int pos = content.toLowerCase(Locale.ROOT).indexOf("url=");
127128
if (pos != -1) {
128129
metaRefresh = content.substring(pos + 4);
129130
addToOutgoingUrls(metaRefresh, localName);

crawler4j-core/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import edu.uci.ics.crawler4j.util.Util;
3333

3434
import java.io.IOException;
35+
import java.nio.charset.StandardCharsets;
3536

3637
/**
3738
* @author Yasser Ganjisaffar
@@ -93,7 +94,7 @@ public void parse(Page page, String contextURL) throws NotAllowedContentExceptio
9394
try {
9495
CssParseData parseData = new CssParseData(factory, normalizer);
9596
if (page.getContentCharset() == null) {
96-
parseData.setTextContent(new String(page.getContentData()));
97+
parseData.setTextContent(new String(page.getContentData(), StandardCharsets.UTF_8));
9798
} else {
9899
parseData.setTextContent(
99100
new String(page.getContentData(), page.getContentCharset()));
@@ -108,7 +109,7 @@ public void parse(Page page, String contextURL) throws NotAllowedContentExceptio
108109
try {
109110
TextParseData parseData = new TextParseData();
110111
if (page.getContentCharset() == null) {
111-
parseData.setTextContent(new String(page.getContentData()));
112+
parseData.setTextContent(new String(page.getContentData(), StandardCharsets.UTF_8));
112113
} else {
113114
parseData.setTextContent(
114115
new String(page.getContentData(), page.getContentCharset()));

crawler4j-core/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@
2222
import java.io.ByteArrayInputStream;
2323
import java.io.InputStream;
2424
import java.io.UnsupportedEncodingException;
25+
import java.nio.charset.StandardCharsets;
2526
import java.util.HashSet;
27+
import java.util.Locale;
2628
import java.util.Set;
2729

2830
import crawlercommons.filters.basic.BasicURLNormalizer;
@@ -93,7 +95,7 @@ public HtmlParseData parse(Page page, String contextURL) throws ParseException {
9395
parsedData.setOutgoingUrls(outgoingUrls);
9496

9597
if (page.getContentCharset() == null) {
96-
parsedData.setHtml(new String(page.getContentData()));
98+
parsedData.setHtml(new String(page.getContentData(), StandardCharsets.UTF_8));
9799
} else {
98100
parsedData.setHtml(new String(page.getContentData(), page.getContentCharset()));
99101
}
@@ -123,7 +125,7 @@ private Set<WebURL> getOutgoingUrls(String contextURL, HtmlContentHandler conten
123125
continue;
124126
}
125127

126-
String hrefLoweredCase = href.trim().toLowerCase();
128+
String hrefLoweredCase = href.trim().toLowerCase(Locale.ROOT);
127129
if (!hrefLoweredCase.contains("javascript:") &&
128130
!hrefLoweredCase.contains("mailto:") && !hrefLoweredCase.contains("@")) {
129131
String url = normalizer.filter(UrlResolver.resolveUrl((contextURL == null) ? "" : contextURL, href));

crawler4j-core/src/main/java/edu/uci/ics/crawler4j/robotstxt/RobotstxtServer.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
import java.net.URL;
4545
import java.net.UnknownHostException;
4646
import java.util.HashMap;
47+
import java.util.Locale;
4748
import java.util.Map;
4849

4950
public class RobotstxtServer {
@@ -70,7 +71,7 @@ public RobotstxtServer(RobotstxtConfig config, PageFetcher pageFetcher, WebURLFa
7071
}
7172

7273
private static String getHost(URL url) {
73-
return url.getHost().toLowerCase();
74+
return url.getHost().toLowerCase(Locale.ROOT);
7475
}
7576

7677
/**

crawler4j-core/src/test/java/edu/uci/ics/crawler4j/tests/fetcher/PageFetcherHtmlOnly.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import java.security.KeyStoreException;
2626
import java.security.NoSuchAlgorithmException;
2727
import java.util.Date;
28+
import java.util.Locale;
2829

2930
import crawlercommons.filters.basic.BasicURLNormalizer;
3031
import edu.uci.ics.crawler4j.url.WebURL;
@@ -70,7 +71,7 @@ public PageFetchResult fetchPage(WebURL webUrl)
7071

7172
String contentType = response.containsHeader("Content-Type") ?
7273
response.getFirstHeader("Content-Type").getValue() : null;
73-
String typeStr = (contentType != null) ? contentType.toLowerCase() : "";
74+
String typeStr = (contentType != null) ? contentType.toLowerCase(Locale.ROOT) : "";
7475

7576
if (typeStr.equals("") || (typeStr.contains("text") && typeStr.contains("html"))) {
7677
return super.fetchPage(webUrl);

crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawler.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
*/
2020
package edu.uci.ics.crawler4j.examples.basic;
2121

22+
import java.util.Locale;
2223
import java.util.Set;
2324
import java.util.concurrent.atomic.AtomicInteger;
2425
import java.util.regex.Pattern;
@@ -52,7 +53,7 @@ public BasicCrawler(AtomicInteger numSeenImages) {
5253
*/
5354
@Override
5455
public boolean shouldVisit(Page referringPage, WebURL url) {
55-
String href = url.getURL().toLowerCase();
56+
String href = url.getURL().toLowerCase(Locale.ROOT);
5657
// Ignore the url if it has an extension that matches our defined set of image extensions.
5758
if (IMAGE_EXTENSIONS.matcher(href).matches()) {
5859
numSeenImages.incrementAndGet();

0 commit comments

Comments
 (0)