Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move authentication logic out of the PageFetcher constructor to allow… #104

Merged
merged 2 commits into from
Aug 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,16 @@
package edu.uci.ics.crawler4j.crawler.authentication;

import java.net.MalformedURLException;
import java.util.Map;

import javax.swing.text.html.FormSubmitEvent.MethodType;

import org.apache.hc.client5.http.auth.AuthScope;
import org.apache.hc.client5.http.auth.Credentials;
import org.apache.hc.client5.http.auth.UsernamePasswordCredentials;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Created by Avi Hayun on 11/25/2014.
*
Expand All @@ -37,8 +44,10 @@
* </li>
* </ul>
*/
public class BasicAuthInfo extends AuthInfo {

public class BasicAuthInfo extends AuthInfo implements CredentialsProvider {

private static final Logger LOGGER = LoggerFactory.getLogger(BasicAuthInfo.class);

/**
* Constructor
*
Expand All @@ -53,4 +62,17 @@ public BasicAuthInfo(String username, String password, String loginUrl)
super(AuthenticationType.BASIC_AUTHENTICATION, MethodType.GET, loginUrl, username,
password);
}



/**
 * Registers the BASIC authentication credentials of this login in the given map.
 * <p>
 * The entry is keyed by an {@link AuthScope} built from {@code getHost()} and
 * {@code getPort()}; its value is a {@link UsernamePasswordCredentials} built from
 * {@code getUsername()} and {@code getPassword()}.
 * <p>
 * Official Example:
 * https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org/apache/http/examples/client/ClientAuthentication.java
 *
 * @param credentialsMap map that receives the credentials entry for this login
 */
@Override
public void addCredentials(Map<AuthScope, Credentials> credentialsMap) {
    LOGGER.info("BASIC authentication for: {}", getLoginTarget());
    Credentials credentials =
        new UsernamePasswordCredentials(getUsername(), getPassword().toCharArray());
    credentialsMap.put(new AuthScope(getHost(), getPort()), credentials);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package edu.uci.ics.crawler4j.crawler.authentication;

import java.util.Map;

import org.apache.hc.client5.http.auth.AuthScope;
import org.apache.hc.client5.http.auth.Credentials;

/**
 * Implemented by authentication descriptors that can contribute credentials to the
 * scope-to-credentials map assembled while the page fetcher is being constructed.
 */
public interface CredentialsProvider {

/**
 * Adds the credentials described by this object to the given map.
 *
 * @param credentialsMap map from authentication scope to credentials that the
 *                       implementation adds its own entry to
 */
void addCredentials(Map<AuthScope, Credentials> credentialsMap);

}
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,23 @@
*/
package edu.uci.ics.crawler4j.crawler.authentication;

import java.io.IOException;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import javax.swing.text.html.FormSubmitEvent.MethodType;

import org.apache.hc.client5.http.ClientProtocolException;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.entity.UrlEncodedFormEntity;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.core5.http.NameValuePair;
import org.apache.hc.core5.http.message.BasicNameValuePair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Created by Avi Hayun on 11/25/2014.
*
Expand All @@ -34,7 +47,9 @@
* username and password into an HTML form
*/
public class FormAuthInfo extends AuthInfo {


private static final Logger LOGGER = LoggerFactory.getLogger(FormAuthInfo.class);

private String usernameFormStr;
private String passwordFormStr;

Expand Down Expand Up @@ -85,4 +100,38 @@ public String getPasswordFormStr() {
/**
 * Sets the name of the form field that carries the password in the login request.
 *
 * @param passwordFormStr name of the password form parameter
 */
public void setPasswordFormStr(String passwordFormStr) {
this.passwordFormStr = passwordFormStr;
}


/**
 * Performs FORM authentication by POSTing the login form with the given client.
 * <p>
 * The request URI is assembled from {@code getProtocol()}, {@code getHost()},
 * {@code getPort()} and {@code getLoginTarget()}; the form fields come from
 * {@link #createFormParams()} and are sent UTF-8 URL-encoded. I/O and protocol failures
 * are logged and not rethrown.
 * <p>
 * Official Example:
 * https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org/apache/http/examples/client/ClientFormLogin.java
 *
 * @param httpClient client used to send the login request
 */
public void doFormLogin(final CloseableHttpClient httpClient) {
    LOGGER.info("FORM authentication for: {}", getLoginTarget());
    String fullUri = getProtocol() + "://" + getHost() + ":" + getPort() + getLoginTarget();
    HttpPost httpPost = new HttpPost(fullUri);
    List<NameValuePair> formParams = createFormParams();
    UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formParams, StandardCharsets.UTF_8);
    httpPost.setEntity(entity);

    try {
        // Use the response-handler overload: the client then consumes the response entity and
        // releases the connection itself. The plain execute(request) overload returns a
        // response that must be closed by the caller, which previously never happened and
        // leaked a pooled connection per login.
        Integer statusCode = httpClient.execute(httpPost, response -> response.getCode());
        LOGGER.debug("Successfully request to login in with user: {} to: {}", getUsername(), getHost());
        LOGGER.debug("Login request to: {} returned HTTP status: {}", getHost(), statusCode);
    } catch (ClientProtocolException e) {
        LOGGER.error("While trying to login to: {} - Client protocol not supported", getHost(), e);
    } catch (IOException e) {
        LOGGER.error("While trying to login to: {} - Error making request", getHost(), e);
    }
}

/**
 * Builds the name/value pairs submitted with the login form: the username field followed by
 * the password field. Open for extension, so subclasses may supply additional fields.
 *
 * @return a new, mutable list holding the username and password form parameters
 */
protected List<NameValuePair> createFormParams() {
    final List<NameValuePair> loginFields = new ArrayList<>();

    final NameValuePair usernameField = new BasicNameValuePair(getUsernameFormStr(), getUsername());
    loginFields.add(usernameField);

    final NameValuePair passwordField = new BasicNameValuePair(getPasswordFormStr(), getPassword());
    loginFields.add(passwordField);

    return loginFields;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,26 @@
*/
package edu.uci.ics.crawler4j.crawler.authentication;

import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.UnknownHostException;
import java.util.Map;

import javax.swing.text.html.FormSubmitEvent.MethodType;

import org.apache.hc.client5.http.auth.AuthScope;
import org.apache.hc.client5.http.auth.Credentials;
import org.apache.hc.client5.http.auth.NTCredentials;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Authentication information for Microsoft Active Directory
*/
public class NtAuthInfo extends AuthInfo {
public class NtAuthInfo extends AuthInfo implements CredentialsProvider {

private static final Logger LOGGER = LoggerFactory.getLogger(NtAuthInfo.class);

private String domain;

public NtAuthInfo(String username, String password, String loginUrl, String domain)
Expand All @@ -42,4 +54,21 @@ public String getDomain() {
/**
 * Sets the domain that is passed on when the NT credentials are created.
 *
 * @param domain the domain to authenticate against
 */
public void setDomain(String domain) {
this.domain = domain;
}



/**
 * Registers NT credentials (for Microsoft AD sites) of this login in the given map.
 * <p>
 * The entry is keyed by an {@link AuthScope} built from {@code getHost()} and
 * {@code getPort()}. The local machine's host name is used as the workstation; if it cannot
 * be resolved, the error is logged and the map is left unchanged.
 *
 * @param credentialsMap map that receives the credentials entry for this login
 */
@Override
public void addCredentials(Map<AuthScope, Credentials> credentialsMap) {
    LOGGER.info("NT authentication for: {}", getLoginTarget());
    try {
        Credentials credentials = new NTCredentials(getUsername(),
            getPassword().toCharArray(), InetAddress.getLocalHost().getHostName(),
            getDomain());
        credentialsMap.put(new AuthScope(getHost(), getPort()), credentials);
    } catch (UnknownHostException e) {
        LOGGER.error("Error creating NT credentials", e);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,33 +20,22 @@
package edu.uci.ics.crawler4j.fetcher;

import java.io.IOException;
import java.net.InetAddress;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.nio.charset.StandardCharsets;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.net.ssl.SSLContext;

import crawlercommons.filters.basic.BasicURLNormalizer;
import edu.uci.ics.crawler4j.PolitenessServer;
import edu.uci.ics.crawler4j.fetcher.politeness.CachedPolitenessServer;
import org.apache.hc.client5.http.ClientProtocolException;
import org.apache.hc.client5.http.auth.AuthScope;
import org.apache.hc.client5.http.auth.Credentials;
import org.apache.hc.client5.http.auth.NTCredentials;
import org.apache.hc.client5.http.auth.UsernamePasswordCredentials;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.classic.methods.HttpUriRequest;
import org.apache.hc.client5.http.config.RequestConfig;
import org.apache.hc.client5.http.entity.UrlEncodedFormEntity;
import org.apache.hc.client5.http.impl.auth.BasicCredentialsProvider;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
Expand All @@ -60,21 +49,21 @@
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.HttpHost;
import org.apache.hc.core5.http.HttpStatus;
import org.apache.hc.core5.http.NameValuePair;
import org.apache.hc.core5.http.config.Registry;
import org.apache.hc.core5.http.config.RegistryBuilder;
import org.apache.hc.core5.http.message.BasicNameValuePair;
import org.apache.hc.core5.ssl.SSLContexts;
import org.apache.hc.core5.util.Timeout;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import crawlercommons.filters.basic.BasicURLNormalizer;
import edu.uci.ics.crawler4j.PolitenessServer;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.authentication.AuthInfo;
import edu.uci.ics.crawler4j.crawler.authentication.BasicAuthInfo;
import edu.uci.ics.crawler4j.crawler.authentication.CredentialsProvider;
import edu.uci.ics.crawler4j.crawler.authentication.FormAuthInfo;
import edu.uci.ics.crawler4j.crawler.authentication.NtAuthInfo;
import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
import edu.uci.ics.crawler4j.fetcher.politeness.CachedPolitenessServer;
import edu.uci.ics.crawler4j.url.UrlResolver;
import edu.uci.ics.crawler4j.url.WebURL;

Expand Down Expand Up @@ -159,10 +148,9 @@ public PageFetcher(CrawlConfig config, BasicURLNormalizer normalizer, Politeness
List<AuthInfo> authInfos = config.getAuthInfos();
if (authInfos != null) {
for (AuthInfo authInfo : authInfos) {
if (AuthInfo.AuthenticationType.BASIC_AUTHENTICATION.equals(authInfo.getAuthenticationType())) {
addBasicCredentials((BasicAuthInfo) authInfo, credentialsMap);
} else if (AuthInfo.AuthenticationType.NT_AUTHENTICATION.equals(authInfo.getAuthenticationType())) {
addNtCredentials((NtAuthInfo) authInfo, credentialsMap);
if (authInfo instanceof CredentialsProvider) {
CredentialsProvider credentialsProvider = (CredentialsProvider) authInfo;
credentialsProvider.addCredentials(credentialsMap);
}
}

Expand All @@ -177,7 +165,7 @@ public PageFetcher(CrawlConfig config, BasicURLNormalizer normalizer, Politeness
.filter(info ->
AuthInfo.AuthenticationType.FORM_AUTHENTICATION.equals(info.getAuthenticationType()))
.map(FormAuthInfo.class::cast)
.forEach(this::doFormLogin);
.forEach(t -> t.doFormLogin(httpClient));
} else {
httpClient = clientBuilder.build();
}
Expand All @@ -188,66 +176,6 @@ public PageFetcher(CrawlConfig config, BasicURLNormalizer normalizer, Politeness
connectionMonitorThread.start();
}

/**
 * Stores username/password credentials for BASIC authentication in the given map, keyed by
 * the host and port of the supplied auth info.
 * <p>
 * Official Example: https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org
 * /apache/http/examples/client/ClientAuthentication.java
 *
 * @param authInfo       source of username, password, host and port
 * @param credentialsMap map that receives the new credentials entry
 */
private void addBasicCredentials(BasicAuthInfo authInfo,
                                 Map<AuthScope, Credentials> credentialsMap) {
    logger.info("BASIC authentication for: {}", authInfo.getLoginTarget());

    final String user = authInfo.getUsername();
    final char[] secret = authInfo.getPassword().toCharArray();
    final Credentials basicCredentials = new UsernamePasswordCredentials(user, secret);

    final AuthScope scope = new AuthScope(authInfo.getHost(), authInfo.getPort());
    credentialsMap.put(scope, basicCredentials);
}

/**
 * Stores NT credentials (Microsoft AD sites) in the given map, keyed by the host and port of
 * the supplied auth info. The local host name is used as the workstation; when it cannot be
 * resolved the error is logged and nothing is added.
 *
 * @param authInfo       source of username, password, domain, host and port
 * @param credentialsMap map that receives the new credentials entry
 */
private void addNtCredentials(NtAuthInfo authInfo, Map<AuthScope, Credentials> credentialsMap) {
    logger.info("NT authentication for: {}", authInfo.getLoginTarget());
    try {
        final String user = authInfo.getUsername();
        final char[] secret = authInfo.getPassword().toCharArray();
        final String workstation = InetAddress.getLocalHost().getHostName();
        final String domain = authInfo.getDomain();
        final Credentials ntCredentials = new NTCredentials(user, secret, workstation, domain);

        final AuthScope scope = new AuthScope(authInfo.getHost(), authInfo.getPort());
        credentialsMap.put(scope, ntCredentials);
    } catch (UnknownHostException e) {
        logger.error("Error creating NT credentials", e);
    }
}

/**
 * Performs FORM authentication by POSTing the username and password form fields to the login
 * target described by the given auth info.
 * <p>
 * The request URI is assembled from the auth info's protocol, host, port and login target,
 * and the form is sent UTF-8 URL-encoded. I/O and protocol failures are logged and not
 * rethrown.
 * <p>
 * Official Example:
 * https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org/apache/http/examples/client/ClientFormLogin.java
 *
 * @param authInfo form login description (target, field names, username and password)
 */
private void doFormLogin(FormAuthInfo authInfo) {
    logger.info("FORM authentication for: {}", authInfo.getLoginTarget());
    String fullUri =
        authInfo.getProtocol() + "://" + authInfo.getHost() + ":" + authInfo.getPort() +
        authInfo.getLoginTarget();
    HttpPost httpPost = new HttpPost(fullUri);
    List<NameValuePair> formParams = new ArrayList<>();
    formParams.add(
        new BasicNameValuePair(authInfo.getUsernameFormStr(), authInfo.getUsername()));
    formParams.add(
        new BasicNameValuePair(authInfo.getPasswordFormStr(), authInfo.getPassword()));
    UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formParams, StandardCharsets.UTF_8);
    httpPost.setEntity(entity);

    try {
        // Use the response-handler overload: the client then consumes the response entity and
        // releases the connection itself. The plain execute(request) overload returns a
        // response that must be closed by the caller, which previously never happened and
        // leaked a pooled connection per login.
        Integer statusCode = httpClient.execute(httpPost, response -> response.getCode());
        logger.debug("Successfully request to login in with user: {} to: {}", authInfo.getUsername(),
                     authInfo.getHost());
        logger.debug("Login request to: {} returned HTTP status: {}", authInfo.getHost(),
                     statusCode);
    } catch (ClientProtocolException e) {
        logger.error("While trying to login to: {} - Client protocol not supported",
                     authInfo.getHost(), e);
    } catch (IOException e) {
        logger.error("While trying to login to: {} - Error making request", authInfo.getHost(),
                     e);
    }
}

public PageFetchResult fetchPage(WebURL webUrl)
throws InterruptedException, IOException, PageBiggerThanMaxSizeException, URISyntaxException {
// Getting URL, setting headers & content
Expand Down