diff --git a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java index d0f88a33e..e1da1e57e 100644 --- a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java +++ b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java @@ -20,9 +20,16 @@ package edu.uci.ics.crawler4j.crawler.authentication; import java.net.MalformedURLException; +import java.util.Map; import javax.swing.text.html.FormSubmitEvent.MethodType; +import org.apache.hc.client5.http.auth.AuthScope; +import org.apache.hc.client5.http.auth.Credentials; +import org.apache.hc.client5.http.auth.UsernamePasswordCredentials; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** * Created by Avi Hayun on 11/25/2014. * @@ -37,8 +44,10 @@ * * */ -public class BasicAuthInfo extends AuthInfo { - +public class BasicAuthInfo extends AuthInfo implements CredentialsProvider { + + private static final Logger LOGGER = LoggerFactory.getLogger(BasicAuthInfo.class); + /** * Constructor * @@ -53,4 +62,17 @@ public BasicAuthInfo(String username, String password, String loginUrl) super(AuthenticationType.BASIC_AUTHENTICATION, MethodType.GET, loginUrl, username, password); } + + + + /** + * BASIC authentication
+ * Official Example: https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org + * /apache/http/examples/client/ClientAuthentication.java + */ + public void addCredentials(Map credentialsMap) { + LOGGER.info("BASIC authentication for: {}", getLoginTarget()); + Credentials credentials = new UsernamePasswordCredentials(getUsername(), getPassword().toCharArray()); + credentialsMap.put(new AuthScope(getHost(), getPort()), credentials); + } } diff --git a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/CredentialsProvider.java b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/CredentialsProvider.java new file mode 100644 index 000000000..2ccb8eb83 --- /dev/null +++ b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/CredentialsProvider.java @@ -0,0 +1,12 @@ +package edu.uci.ics.crawler4j.crawler.authentication; + +import java.util.Map; + +import org.apache.hc.client5.http.auth.AuthScope; +import org.apache.hc.client5.http.auth.Credentials; + +public interface CredentialsProvider { + + void addCredentials(Map credentialsMap); + +} diff --git a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/FormAuthInfo.java b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/FormAuthInfo.java index fe4aa17b0..c2346051e 100644 --- a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/FormAuthInfo.java +++ b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/FormAuthInfo.java @@ -19,10 +19,23 @@ */ package edu.uci.ics.crawler4j.crawler.authentication; +import java.io.IOException; import java.net.MalformedURLException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; import javax.swing.text.html.FormSubmitEvent.MethodType; +import org.apache.hc.client5.http.ClientProtocolException; +import org.apache.hc.client5.http.classic.methods.HttpPost; +import org.apache.hc.client5.http.entity.UrlEncodedFormEntity; +import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; +import org.apache.hc.core5.http.NameValuePair; +import org.apache.hc.core5.http.message.BasicNameValuePair; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** * Created by Avi Hayun on 11/25/2014. * @@ -34,7 +47,9 @@ * username and password into an HTML form */ public class FormAuthInfo extends AuthInfo { - + + private static final Logger LOGGER = LoggerFactory.getLogger(FormAuthInfo.class); + private String usernameFormStr; private String passwordFormStr; @@ -85,4 +100,38 @@ public String getPasswordFormStr() { public void setPasswordFormStr(String passwordFormStr) { this.passwordFormStr = passwordFormStr; } + + + /** + * FORM authentication
+ * Official Example: https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org + * /apache/http/examples/client/ClientFormLogin.java + */ + public void doFormLogin(final CloseableHttpClient httpClient) { + LOGGER.info("FORM authentication for: {}", getLoginTarget()); + String fullUri = getProtocol() + "://" + getHost() + ":" + getPort() + getLoginTarget(); + HttpPost httpPost = new HttpPost(fullUri); + List formParams = createFormParams(); + UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formParams, StandardCharsets.UTF_8); + httpPost.setEntity(entity); + + try { + httpClient.execute(httpPost); + LOGGER.debug("Successfully request to login in with user: {} to: {}", getUsername(), getHost()); + } catch (ClientProtocolException e) { + LOGGER.error("While trying to login to: {} - Client protocol not supported", getHost(), e); + } catch (IOException e) { + LOGGER.error("While trying to login to: {} - Error making request", getHost(), e); + } + } + + /** + * Open for extension. + */ + protected List createFormParams() { + List formParams = new ArrayList<>(); + formParams.add(new BasicNameValuePair(getUsernameFormStr(), getUsername())); + formParams.add(new BasicNameValuePair(getPasswordFormStr(), getPassword())); + return formParams; + } } diff --git a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/NtAuthInfo.java b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/NtAuthInfo.java index 982cd2fac..fa4368eb9 100644 --- a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/NtAuthInfo.java +++ b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/NtAuthInfo.java @@ -19,14 +19,26 @@ */ package edu.uci.ics.crawler4j.crawler.authentication; +import java.net.InetAddress; import java.net.MalformedURLException; +import java.net.UnknownHostException; +import java.util.Map; import javax.swing.text.html.FormSubmitEvent.MethodType; +import org.apache.hc.client5.http.auth.AuthScope; +import org.apache.hc.client5.http.auth.Credentials; +import org.apache.hc.client5.http.auth.NTCredentials; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** * Authentication information for Microsoft Active Directory */ -public class NtAuthInfo extends AuthInfo { +public class NtAuthInfo extends AuthInfo implements CredentialsProvider { + + private static final Logger LOGGER = LoggerFactory.getLogger(NtAuthInfo.class); + private String domain; public NtAuthInfo(String username, String password, String loginUrl, String domain) @@ -42,4 +54,21 @@ public String getDomain() { public void setDomain(String domain) { this.domain = domain; } + + + + /** + * Do NT auth for Microsoft AD sites. + */ + public void addCredentials(Map credentialsMap) { + LOGGER.info("NT authentication for: {}", getLoginTarget()); + try { + Credentials credentials = new NTCredentials(getUsername(), + getPassword().toCharArray(), InetAddress.getLocalHost().getHostName(), + getDomain()); + credentialsMap.put(new AuthScope(getHost(), getPort()), credentials); + } catch (UnknownHostException e) { + LOGGER.error("Error creating NT credentials", e); + } + } } diff --git a/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java b/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java index c6aa274db..e0ae381f2 100644 --- a/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java +++ b/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java @@ -20,33 +20,22 @@ package edu.uci.ics.crawler4j.fetcher; import java.io.IOException; -import java.net.InetAddress; import java.net.URISyntaxException; -import java.net.UnknownHostException; -import java.nio.charset.StandardCharsets; import java.security.KeyManagementException; import java.security.KeyStoreException; import java.security.NoSuchAlgorithmException; -import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import javax.net.ssl.SSLContext; -import crawlercommons.filters.basic.BasicURLNormalizer; -import edu.uci.ics.crawler4j.PolitenessServer; -import edu.uci.ics.crawler4j.fetcher.politeness.CachedPolitenessServer; -import org.apache.hc.client5.http.ClientProtocolException; import org.apache.hc.client5.http.auth.AuthScope; import org.apache.hc.client5.http.auth.Credentials; -import org.apache.hc.client5.http.auth.NTCredentials; import org.apache.hc.client5.http.auth.UsernamePasswordCredentials; import org.apache.hc.client5.http.classic.methods.HttpGet; -import org.apache.hc.client5.http.classic.methods.HttpPost; import org.apache.hc.client5.http.classic.methods.HttpUriRequest; import org.apache.hc.client5.http.config.RequestConfig; -import org.apache.hc.client5.http.entity.UrlEncodedFormEntity; import org.apache.hc.client5.http.impl.auth.BasicCredentialsProvider; import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; @@ -60,21 +49,21 @@ import org.apache.hc.core5.http.HttpHeaders; import org.apache.hc.core5.http.HttpHost; import org.apache.hc.core5.http.HttpStatus; -import org.apache.hc.core5.http.NameValuePair; import org.apache.hc.core5.http.config.Registry; import org.apache.hc.core5.http.config.RegistryBuilder; -import org.apache.hc.core5.http.message.BasicNameValuePair; import org.apache.hc.core5.ssl.SSLContexts; import org.apache.hc.core5.util.Timeout; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import crawlercommons.filters.basic.BasicURLNormalizer; +import edu.uci.ics.crawler4j.PolitenessServer; import edu.uci.ics.crawler4j.crawler.CrawlConfig; import edu.uci.ics.crawler4j.crawler.authentication.AuthInfo; -import edu.uci.ics.crawler4j.crawler.authentication.BasicAuthInfo; +import edu.uci.ics.crawler4j.crawler.authentication.CredentialsProvider; import edu.uci.ics.crawler4j.crawler.authentication.FormAuthInfo; -import edu.uci.ics.crawler4j.crawler.authentication.NtAuthInfo; import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException; +import edu.uci.ics.crawler4j.fetcher.politeness.CachedPolitenessServer; import edu.uci.ics.crawler4j.url.UrlResolver; import edu.uci.ics.crawler4j.url.WebURL; @@ -159,10 +148,9 @@ public PageFetcher(CrawlConfig config, BasicURLNormalizer normalizer, Politeness List authInfos = config.getAuthInfos(); if (authInfos != null) { for (AuthInfo authInfo : authInfos) { - if (AuthInfo.AuthenticationType.BASIC_AUTHENTICATION.equals(authInfo.getAuthenticationType())) { - addBasicCredentials((BasicAuthInfo) authInfo, credentialsMap); - } else if (AuthInfo.AuthenticationType.NT_AUTHENTICATION.equals(authInfo.getAuthenticationType())) { - addNtCredentials((NtAuthInfo) authInfo, credentialsMap); + if (authInfo instanceof CredentialsProvider) { + CredentialsProvider credentialsProvider = (CredentialsProvider) authInfo; + credentialsProvider.addCredentials(credentialsMap); } } @@ -177,7 +165,7 @@ public PageFetcher(CrawlConfig config, BasicURLNormalizer normalizer, Politeness .filter(info -> AuthInfo.AuthenticationType.FORM_AUTHENTICATION.equals(info.getAuthenticationType())) .map(FormAuthInfo.class::cast) - .forEach(this::doFormLogin); + .forEach(t -> t.doFormLogin(httpClient)); } else { httpClient = clientBuilder.build(); } @@ -188,66 +176,6 @@ public PageFetcher(CrawlConfig config, BasicURLNormalizer normalizer, Politeness connectionMonitorThread.start(); } - /** - * BASIC authentication
- * Official Example: https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org - * /apache/http/examples/client/ClientAuthentication.java - */ - private void addBasicCredentials(BasicAuthInfo authInfo, - Map credentialsMap) { - logger.info("BASIC authentication for: {}", authInfo.getLoginTarget()); - Credentials credentials = new UsernamePasswordCredentials(authInfo.getUsername(), - authInfo.getPassword().toCharArray()); - credentialsMap.put(new AuthScope(authInfo.getHost(), authInfo.getPort()), credentials); - } - - /** - * Do NT auth for Microsoft AD sites. - */ - private void addNtCredentials(NtAuthInfo authInfo, Map credentialsMap) { - logger.info("NT authentication for: {}", authInfo.getLoginTarget()); - try { - Credentials credentials = new NTCredentials(authInfo.getUsername(), - authInfo.getPassword().toCharArray(), InetAddress.getLocalHost().getHostName(), - authInfo.getDomain()); - credentialsMap.put(new AuthScope(authInfo.getHost(), authInfo.getPort()), credentials); - } catch (UnknownHostException e) { - logger.error("Error creating NT credentials", e); - } - } - - /** - * FORM authentication
- * Official Example: https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org - * /apache/http/examples/client/ClientFormLogin.java - */ - private void doFormLogin(FormAuthInfo authInfo) { - logger.info("FORM authentication for: {}", authInfo.getLoginTarget()); - String fullUri = - authInfo.getProtocol() + "://" + authInfo.getHost() + ":" + authInfo.getPort() + - authInfo.getLoginTarget(); - HttpPost httpPost = new HttpPost(fullUri); - List formParams = new ArrayList<>(); - formParams.add( - new BasicNameValuePair(authInfo.getUsernameFormStr(), authInfo.getUsername())); - formParams.add( - new BasicNameValuePair(authInfo.getPasswordFormStr(), authInfo.getPassword())); - UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formParams, StandardCharsets.UTF_8); - httpPost.setEntity(entity); - - try { - httpClient.execute(httpPost); - logger.debug("Successfully request to login in with user: {} to: {}", authInfo.getUsername(), - authInfo.getHost()); - } catch (ClientProtocolException e) { - logger.error("While trying to login to: {} - Client protocol not supported", - authInfo.getHost(), e); - } catch (IOException e) { - logger.error("While trying to login to: {} - Error making request", authInfo.getHost(), - e); - } - } - public PageFetchResult fetchPage(WebURL webUrl) throws InterruptedException, IOException, PageBiggerThanMaxSizeException, URISyntaxException { // Getting URL, setting headers & content