diff --git a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java
index d0f88a33e..e1da1e57e 100644
--- a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java
+++ b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java
@@ -20,9 +20,16 @@
package edu.uci.ics.crawler4j.crawler.authentication;
import java.net.MalformedURLException;
+import java.util.Map;
import javax.swing.text.html.FormSubmitEvent.MethodType;
+import org.apache.hc.client5.http.auth.AuthScope;
+import org.apache.hc.client5.http.auth.Credentials;
+import org.apache.hc.client5.http.auth.UsernamePasswordCredentials;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
* Created by Avi Hayun on 11/25/2014.
*
@@ -37,8 +44,10 @@
*
*
*/
-public class BasicAuthInfo extends AuthInfo {
-
+public class BasicAuthInfo extends AuthInfo implements CredentialsProvider {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(BasicAuthInfo.class);
+
/**
* Constructor
*
@@ -53,4 +62,17 @@ public BasicAuthInfo(String username, String password, String loginUrl)
super(AuthenticationType.BASIC_AUTHENTICATION, MethodType.GET, loginUrl, username,
password);
}
+
+
+
+    /**
+     * BASIC authentication: registers this login's username/password for its host and port in {@code credentialsMap}.
+     * Official Example: https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org/apache/http/examples/client/ClientAuthentication.java
+     */
+    @Override
+    public void addCredentials(Map<AuthScope, Credentials> credentialsMap) {
+        LOGGER.info("BASIC authentication for: {}", getLoginTarget());
+        Credentials credentials = new UsernamePasswordCredentials(getUsername(), getPassword().toCharArray());
+        credentialsMap.put(new AuthScope(getHost(), getPort()), credentials);
+    }
}
diff --git a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/CredentialsProvider.java b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/CredentialsProvider.java
new file mode 100644
index 000000000..2ccb8eb83
--- /dev/null
+++ b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/CredentialsProvider.java
@@ -0,0 +1,12 @@
+package edu.uci.ics.crawler4j.crawler.authentication;
+
+import java.util.Map;
+
+import org.apache.hc.client5.http.auth.AuthScope;
+import org.apache.hc.client5.http.auth.Credentials;
+
+/** Implemented by authentication infos that can register credentials into a scope-to-credentials map. */
+public interface CredentialsProvider {
+    /** Adds this provider's credentials to {@code credentialsMap}, keyed by the scope they apply to. */
+    void addCredentials(Map<AuthScope, Credentials> credentialsMap);
+}
diff --git a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/FormAuthInfo.java b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/FormAuthInfo.java
index fe4aa17b0..c2346051e 100644
--- a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/FormAuthInfo.java
+++ b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/FormAuthInfo.java
@@ -19,10 +19,23 @@
*/
package edu.uci.ics.crawler4j.crawler.authentication;
+import java.io.IOException;
import java.net.MalformedURLException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
import javax.swing.text.html.FormSubmitEvent.MethodType;
+import org.apache.hc.client5.http.ClientProtocolException;
+import org.apache.hc.client5.http.classic.methods.HttpPost;
+import org.apache.hc.client5.http.entity.UrlEncodedFormEntity;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
+import org.apache.hc.core5.http.NameValuePair;
+import org.apache.hc.core5.http.message.BasicNameValuePair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
* Created by Avi Hayun on 11/25/2014.
*
@@ -34,7 +47,9 @@
* username and password into an HTML form
*/
public class FormAuthInfo extends AuthInfo {
-
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(FormAuthInfo.class);
+
private String usernameFormStr;
private String passwordFormStr;
@@ -85,4 +100,38 @@ public String getPasswordFormStr() {
public void setPasswordFormStr(String passwordFormStr) {
this.passwordFormStr = passwordFormStr;
}
+
+
+    /**
+     * FORM authentication: posts the username/password form to the login URL using {@code httpClient}.
+     * Official Example: https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org/apache/http/examples/client/ClientFormLogin.java
+     * I/O and protocol failures are logged, not rethrown.
+     */
+    public void doFormLogin(final CloseableHttpClient httpClient) {
+        LOGGER.info("FORM authentication for: {}", getLoginTarget());
+        String fullUri = getProtocol() + "://" + getHost() + ":" + getPort() + getLoginTarget();
+        HttpPost httpPost = new HttpPost(fullUri);
+        List<NameValuePair> formParams = createFormParams();
+        UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formParams, StandardCharsets.UTF_8);
+        httpPost.setEntity(entity);
+        // Handler overload: the client consumes and closes the response, so the connection is released.
+        try {
+            httpClient.execute(httpPost, response -> null);
+            LOGGER.debug("Successfully request to login in with user: {} to: {}", getUsername(), getHost());
+        } catch (ClientProtocolException e) {
+            LOGGER.error("While trying to login to: {} - Client protocol not supported", getHost(), e);
+        } catch (IOException e) {
+            LOGGER.error("While trying to login to: {} - Error making request", getHost(), e);
+        }
+    }
+
+    /**
+     * Builds the username and password form fields sent by {@link #doFormLogin}; override to add more fields.
+     */
+    protected List<NameValuePair> createFormParams() {
+        List<NameValuePair> formParams = new ArrayList<>();
+        formParams.add(new BasicNameValuePair(getUsernameFormStr(), getUsername()));
+        formParams.add(new BasicNameValuePair(getPasswordFormStr(), getPassword()));
+        return formParams;
+    }
}
diff --git a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/NtAuthInfo.java b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/NtAuthInfo.java
index 982cd2fac..fa4368eb9 100644
--- a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/NtAuthInfo.java
+++ b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/NtAuthInfo.java
@@ -19,14 +19,26 @@
*/
package edu.uci.ics.crawler4j.crawler.authentication;
+import java.net.InetAddress;
import java.net.MalformedURLException;
+import java.net.UnknownHostException;
+import java.util.Map;
import javax.swing.text.html.FormSubmitEvent.MethodType;
+import org.apache.hc.client5.http.auth.AuthScope;
+import org.apache.hc.client5.http.auth.Credentials;
+import org.apache.hc.client5.http.auth.NTCredentials;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
* Authentication information for Microsoft Active Directory
*/
-public class NtAuthInfo extends AuthInfo {
+public class NtAuthInfo extends AuthInfo implements CredentialsProvider {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(NtAuthInfo.class);
+
private String domain;
public NtAuthInfo(String username, String password, String loginUrl, String domain)
@@ -42,4 +54,21 @@ public String getDomain() {
public void setDomain(String domain) {
this.domain = domain;
}
+
+
+
+    /**
+     * Do NT auth for Microsoft AD sites: registers NT credentials for this login's host and port in {@code credentialsMap}.
+     */
+    @Override
+    public void addCredentials(Map<AuthScope, Credentials> credentialsMap) {
+        LOGGER.info("NT authentication for: {}", getLoginTarget());
+        try {
+            String workstation = InetAddress.getLocalHost().getHostName();
+            credentialsMap.put(new AuthScope(getHost(), getPort()),
+                new NTCredentials(getUsername(), getPassword().toCharArray(), workstation, getDomain()));
+        } catch (UnknownHostException e) {
+            LOGGER.error("Error creating NT credentials", e);
+        }
+    }
}
diff --git a/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java b/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
index c6aa274db..e0ae381f2 100644
--- a/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
+++ b/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
@@ -20,33 +20,22 @@
package edu.uci.ics.crawler4j.fetcher;
import java.io.IOException;
-import java.net.InetAddress;
import java.net.URISyntaxException;
-import java.net.UnknownHostException;
-import java.nio.charset.StandardCharsets;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
-import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.net.ssl.SSLContext;
-import crawlercommons.filters.basic.BasicURLNormalizer;
-import edu.uci.ics.crawler4j.PolitenessServer;
-import edu.uci.ics.crawler4j.fetcher.politeness.CachedPolitenessServer;
-import org.apache.hc.client5.http.ClientProtocolException;
import org.apache.hc.client5.http.auth.AuthScope;
import org.apache.hc.client5.http.auth.Credentials;
-import org.apache.hc.client5.http.auth.NTCredentials;
import org.apache.hc.client5.http.auth.UsernamePasswordCredentials;
import org.apache.hc.client5.http.classic.methods.HttpGet;
-import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.classic.methods.HttpUriRequest;
import org.apache.hc.client5.http.config.RequestConfig;
-import org.apache.hc.client5.http.entity.UrlEncodedFormEntity;
import org.apache.hc.client5.http.impl.auth.BasicCredentialsProvider;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
@@ -60,21 +49,21 @@
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.HttpHost;
import org.apache.hc.core5.http.HttpStatus;
-import org.apache.hc.core5.http.NameValuePair;
import org.apache.hc.core5.http.config.Registry;
import org.apache.hc.core5.http.config.RegistryBuilder;
-import org.apache.hc.core5.http.message.BasicNameValuePair;
import org.apache.hc.core5.ssl.SSLContexts;
import org.apache.hc.core5.util.Timeout;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import crawlercommons.filters.basic.BasicURLNormalizer;
+import edu.uci.ics.crawler4j.PolitenessServer;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.authentication.AuthInfo;
-import edu.uci.ics.crawler4j.crawler.authentication.BasicAuthInfo;
+import edu.uci.ics.crawler4j.crawler.authentication.CredentialsProvider;
import edu.uci.ics.crawler4j.crawler.authentication.FormAuthInfo;
-import edu.uci.ics.crawler4j.crawler.authentication.NtAuthInfo;
import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
+import edu.uci.ics.crawler4j.fetcher.politeness.CachedPolitenessServer;
import edu.uci.ics.crawler4j.url.UrlResolver;
import edu.uci.ics.crawler4j.url.WebURL;
@@ -159,10 +148,9 @@ public PageFetcher(CrawlConfig config, BasicURLNormalizer normalizer, Politeness
List authInfos = config.getAuthInfos();
if (authInfos != null) {
for (AuthInfo authInfo : authInfos) {
- if (AuthInfo.AuthenticationType.BASIC_AUTHENTICATION.equals(authInfo.getAuthenticationType())) {
- addBasicCredentials((BasicAuthInfo) authInfo, credentialsMap);
- } else if (AuthInfo.AuthenticationType.NT_AUTHENTICATION.equals(authInfo.getAuthenticationType())) {
- addNtCredentials((NtAuthInfo) authInfo, credentialsMap);
+ if (authInfo instanceof CredentialsProvider) {
+ CredentialsProvider credentialsProvider = (CredentialsProvider) authInfo;
+ credentialsProvider.addCredentials(credentialsMap);
}
}
@@ -177,7 +165,7 @@ public PageFetcher(CrawlConfig config, BasicURLNormalizer normalizer, Politeness
.filter(info ->
AuthInfo.AuthenticationType.FORM_AUTHENTICATION.equals(info.getAuthenticationType()))
.map(FormAuthInfo.class::cast)
- .forEach(this::doFormLogin);
+ .forEach(t -> t.doFormLogin(httpClient));
} else {
httpClient = clientBuilder.build();
}
@@ -188,66 +176,6 @@ public PageFetcher(CrawlConfig config, BasicURLNormalizer normalizer, Politeness
connectionMonitorThread.start();
}
- /**
- * BASIC authentication
- * Official Example: https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org
- * /apache/http/examples/client/ClientAuthentication.java
- */
- private void addBasicCredentials(BasicAuthInfo authInfo,
- Map credentialsMap) {
- logger.info("BASIC authentication for: {}", authInfo.getLoginTarget());
- Credentials credentials = new UsernamePasswordCredentials(authInfo.getUsername(),
- authInfo.getPassword().toCharArray());
- credentialsMap.put(new AuthScope(authInfo.getHost(), authInfo.getPort()), credentials);
- }
-
- /**
- * Do NT auth for Microsoft AD sites.
- */
- private void addNtCredentials(NtAuthInfo authInfo, Map credentialsMap) {
- logger.info("NT authentication for: {}", authInfo.getLoginTarget());
- try {
- Credentials credentials = new NTCredentials(authInfo.getUsername(),
- authInfo.getPassword().toCharArray(), InetAddress.getLocalHost().getHostName(),
- authInfo.getDomain());
- credentialsMap.put(new AuthScope(authInfo.getHost(), authInfo.getPort()), credentials);
- } catch (UnknownHostException e) {
- logger.error("Error creating NT credentials", e);
- }
- }
-
- /**
- * FORM authentication
- * Official Example: https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org
- * /apache/http/examples/client/ClientFormLogin.java
- */
- private void doFormLogin(FormAuthInfo authInfo) {
- logger.info("FORM authentication for: {}", authInfo.getLoginTarget());
- String fullUri =
- authInfo.getProtocol() + "://" + authInfo.getHost() + ":" + authInfo.getPort() +
- authInfo.getLoginTarget();
- HttpPost httpPost = new HttpPost(fullUri);
- List formParams = new ArrayList<>();
- formParams.add(
- new BasicNameValuePair(authInfo.getUsernameFormStr(), authInfo.getUsername()));
- formParams.add(
- new BasicNameValuePair(authInfo.getPasswordFormStr(), authInfo.getPassword()));
- UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formParams, StandardCharsets.UTF_8);
- httpPost.setEntity(entity);
-
- try {
- httpClient.execute(httpPost);
- logger.debug("Successfully request to login in with user: {} to: {}", authInfo.getUsername(),
- authInfo.getHost());
- } catch (ClientProtocolException e) {
- logger.error("While trying to login to: {} - Client protocol not supported",
- authInfo.getHost(), e);
- } catch (IOException e) {
- logger.error("While trying to login to: {} - Error making request", authInfo.getHost(),
- e);
- }
- }
-
public PageFetchResult fetchPage(WebURL webUrl)
throws InterruptedException, IOException, PageBiggerThanMaxSizeException, URISyntaxException {
// Getting URL, setting headers & content