samples: feat: adding hadoopfs and autoscaling samples (#3262)
Adding a sample + test for submitting a HadoopFS command using the Java client library.

Adding a sample + test for creating a Dataproc cluster with autoscaling configured.

Follow-up to #2949
bradmiro authored and chingor13 committed Aug 3, 2020
1 parent 351dfb5 commit 4f1ec2c
Showing 5 changed files with 482 additions and 1 deletion.
174 changes: 174 additions & 0 deletions dataproc/snippets/src/main/java/CreateClusterWithAutoscaling.java
@@ -0,0 +1,174 @@
/*
* Copyright 2020 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This sample creates a Dataproc cluster with an autoscaling policy enabled.
* The policy we will be creating mirrors the following YAML representation:
*
workerConfig:
  minInstances: 2
  maxInstances: 100
  weight: 1
secondaryWorkerConfig:
  minInstances: 0
  maxInstances: 100
  weight: 1
basicAlgorithm:
  cooldownPeriod: 4m
  yarnConfig:
    scaleUpFactor: 0.05
    scaleDownFactor: 1.0
    scaleUpMinWorkerFraction: 0.0
    scaleDownMinWorkerFraction: 0.0
    gracefulDecommissionTimeout: 1h
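*
* Note: the "4m" cooldown period and "1h" graceful decommission timeout above
* correspond to the 240-second and 3600-second Duration values set in the code below.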
*/

// [START dataproc_create_autoscaling_cluster]

import com.google.api.gax.longrunning.OperationFuture;
import com.google.cloud.dataproc.v1.AutoscalingConfig;
import com.google.cloud.dataproc.v1.AutoscalingPolicy;
import com.google.cloud.dataproc.v1.AutoscalingPolicyServiceClient;
import com.google.cloud.dataproc.v1.AutoscalingPolicyServiceSettings;
import com.google.cloud.dataproc.v1.BasicAutoscalingAlgorithm;
import com.google.cloud.dataproc.v1.BasicYarnAutoscalingConfig;
import com.google.cloud.dataproc.v1.Cluster;
import com.google.cloud.dataproc.v1.ClusterConfig;
import com.google.cloud.dataproc.v1.ClusterControllerClient;
import com.google.cloud.dataproc.v1.ClusterControllerSettings;
import com.google.cloud.dataproc.v1.ClusterOperationMetadata;
import com.google.cloud.dataproc.v1.InstanceGroupAutoscalingPolicyConfig;
import com.google.cloud.dataproc.v1.InstanceGroupConfig;
import com.google.cloud.dataproc.v1.RegionName;
import com.google.protobuf.Duration;
import java.io.IOException;
import java.util.concurrent.ExecutionException;

public class CreateClusterWithAutoscaling {

  public static void createClusterWithAutoscaling() throws IOException, InterruptedException {
    // TODO(developer): Replace these variables before running the sample.
    String projectId = "your-project-id";
    String region = "your-project-region";
    String clusterName = "your-cluster-name";
    String autoscalingPolicyName = "your-autoscaling-policy";
    createClusterWithAutoscaling(projectId, region, clusterName, autoscalingPolicyName);
  }

  public static void createClusterWithAutoscaling(
      String projectId, String region, String clusterName, String autoscalingPolicyName)
      throws IOException, InterruptedException {
    String myEndpoint = String.format("%s-dataproc.googleapis.com:443", region);

    // Configure the settings for the cluster controller client.
    ClusterControllerSettings clusterControllerSettings =
        ClusterControllerSettings.newBuilder().setEndpoint(myEndpoint).build();

    // Configure the settings for the autoscaling policy service client.
    AutoscalingPolicyServiceSettings autoscalingPolicyServiceSettings =
        AutoscalingPolicyServiceSettings.newBuilder().setEndpoint(myEndpoint).build();

    // Create a cluster controller client and an autoscaling policy service client with the
    // configured settings. The clients only need to be created once and can be reused for
    // multiple requests. Using a try-with-resources block closes the clients automatically,
    // but they can also be closed manually with the .close() method.
    try (ClusterControllerClient clusterControllerClient =
            ClusterControllerClient.create(clusterControllerSettings);
        AutoscalingPolicyServiceClient autoscalingPolicyServiceClient =
            AutoscalingPolicyServiceClient.create(autoscalingPolicyServiceSettings)) {

      // Create the autoscaling policy.
      InstanceGroupAutoscalingPolicyConfig workerInstanceGroupAutoscalingPolicyConfig =
          InstanceGroupAutoscalingPolicyConfig.newBuilder()
              .setMinInstances(2)
              .setMaxInstances(100)
              .setWeight(1)
              .build();
      InstanceGroupAutoscalingPolicyConfig secondaryWorkerInstanceGroupAutoscalingPolicyConfig =
          InstanceGroupAutoscalingPolicyConfig.newBuilder()
              .setMinInstances(0)
              .setMaxInstances(100)
              .setWeight(1)
              .build();
      BasicYarnAutoscalingConfig basicYarnApplicationConfig =
          BasicYarnAutoscalingConfig.newBuilder()
              .setScaleUpFactor(0.05)
              .setScaleDownFactor(1.0)
              .setScaleUpMinWorkerFraction(0.0)
              .setScaleDownMinWorkerFraction(0.0)
              .setGracefulDecommissionTimeout(Duration.newBuilder().setSeconds(3600).build())
              .build();
      BasicAutoscalingAlgorithm basicAutoscalingAlgorithm =
          BasicAutoscalingAlgorithm.newBuilder()
              .setCooldownPeriod(Duration.newBuilder().setSeconds(240).build())
              .setYarnConfig(basicYarnApplicationConfig)
              .build();
      AutoscalingPolicy autoscalingPolicy =
          AutoscalingPolicy.newBuilder()
              .setId(autoscalingPolicyName)
              .setWorkerConfig(workerInstanceGroupAutoscalingPolicyConfig)
              .setSecondaryWorkerConfig(secondaryWorkerInstanceGroupAutoscalingPolicyConfig)
              .setBasicAlgorithm(basicAutoscalingAlgorithm)
              .build();
      RegionName parent = RegionName.of(projectId, region);

      // Policy is uploaded here.
      autoscalingPolicyServiceClient.createAutoscalingPolicy(parent, autoscalingPolicy);

      // Now the policy can be referenced when creating a cluster.
      String autoscalingPolicyUri =
          String.format(
              "projects/%s/locations/%s/autoscalingPolicies/%s",
              projectId, region, autoscalingPolicyName);
      AutoscalingConfig autoscalingConfig =
          AutoscalingConfig.newBuilder().setPolicyUri(autoscalingPolicyUri).build();

      // Configure the settings for our cluster.
      InstanceGroupConfig masterConfig =
          InstanceGroupConfig.newBuilder()
              .setMachineTypeUri("n1-standard-1")
              .setNumInstances(1)
              .build();
      InstanceGroupConfig workerConfig =
          InstanceGroupConfig.newBuilder()
              .setMachineTypeUri("n1-standard-1")
              .setNumInstances(2)
              .build();
      ClusterConfig clusterConfig =
          ClusterConfig.newBuilder()
              .setMasterConfig(masterConfig)
              .setWorkerConfig(workerConfig)
              .setAutoscalingConfig(autoscalingConfig)
              .build();

      // Create the cluster object with the desired cluster config.
      Cluster cluster =
          Cluster.newBuilder().setClusterName(clusterName).setConfig(clusterConfig).build();

      // Create the Dataproc cluster.
      OperationFuture<Cluster, ClusterOperationMetadata> createClusterAsyncRequest =
          clusterControllerClient.createClusterAsync(projectId, region, cluster);
      Cluster response = createClusterAsyncRequest.get();

      // Print out a success message.
      System.out.printf("Cluster created successfully: %s", response.getClusterName());

    } catch (ExecutionException e) {
      // If cluster creation does not complete successfully, print the error message.
      System.err.println(String.format("createClusterWithAutoscaling: %s ", e.getMessage()));
    }
  }
}
// [END dataproc_create_autoscaling_cluster]
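
For quick reference, a minimal sketch of invoking this sample from an application entry point; the wrapper class name and all argument values below are hypothetical placeholders, not part of the sample:

public class CreateClusterWithAutoscalingDemo {

  public static void main(String[] args) throws Exception {
    // Hypothetical placeholder values; replace with a real project, region,
    // cluster name, and autoscaling policy name before running.
    CreateClusterWithAutoscaling.createClusterWithAutoscaling(
        "my-project-id", "us-central1", "my-autoscaling-cluster", "my-autoscaling-policy");
  }
}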
101 changes: 101 additions & 0 deletions dataproc/snippets/src/main/java/SubmitHadoopFsJob.java
@@ -0,0 +1,101 @@
/*
* Copyright 2020 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// [START dataproc_submit_hadoop_fs_job]

import com.google.api.gax.longrunning.OperationFuture;
import com.google.cloud.dataproc.v1.HadoopJob;
import com.google.cloud.dataproc.v1.Job;
import com.google.cloud.dataproc.v1.JobControllerClient;
import com.google.cloud.dataproc.v1.JobControllerSettings;
import com.google.cloud.dataproc.v1.JobMetadata;
import com.google.cloud.dataproc.v1.JobPlacement;
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.concurrent.ExecutionException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SubmitHadoopFsJob {

  // Splits a whitespace-separated query such as "-ls /" into an argument list
  // (["-ls", "/"]) for org.apache.hadoop.fs.FsShell.
  public static ArrayList<String> stringToList(String s) {
    return new ArrayList<>(Arrays.asList(s.split(" ")));
  }

  public static void submitHadoopFsJob() throws IOException, InterruptedException {
    // TODO(developer): Replace these variables before running the sample.
    String projectId = "your-project-id";
    String region = "your-project-region";
    String clusterName = "your-cluster-name";
    String hadoopFsQuery = "your-hadoop-fs-query";
    submitHadoopFsJob(projectId, region, clusterName, hadoopFsQuery);
  }

  public static void submitHadoopFsJob(
      String projectId, String region, String clusterName, String hadoopFsQuery)
      throws IOException, InterruptedException {
    String myEndpoint = String.format("%s-dataproc.googleapis.com:443", region);

    // Configure the settings for the job controller client.
    JobControllerSettings jobControllerSettings =
        JobControllerSettings.newBuilder().setEndpoint(myEndpoint).build();

    // Create a job controller client with the configured settings. Using a
    // try-with-resources block closes the client automatically, but it can also be
    // closed manually with the .close() method.
    try (JobControllerClient jobControllerClient =
        JobControllerClient.create(jobControllerSettings)) {

      // Configure cluster placement for the job.
      JobPlacement jobPlacement = JobPlacement.newBuilder().setClusterName(clusterName).build();

      // Configure Hadoop job settings. The HadoopFS query is set here.
      HadoopJob hadoopJob =
          HadoopJob.newBuilder()
              .setMainClass("org.apache.hadoop.fs.FsShell")
              .addAllArgs(stringToList(hadoopFsQuery))
              .build();

      Job job = Job.newBuilder().setPlacement(jobPlacement).setHadoopJob(hadoopJob).build();

      // Submit an asynchronous request to execute the job.
      OperationFuture<Job, JobMetadata> submitJobAsOperationAsyncRequest =
          jobControllerClient.submitJobAsOperationAsync(projectId, region, job);

      Job response = submitJobAsOperationAsyncRequest.get();

      // The job's driver output is written to Google Cloud Storage; parse the
      // gs://<bucket>/<object-prefix> URI to locate and print it.
      Matcher matches =
          Pattern.compile("gs://(.*?)/(.*)").matcher(response.getDriverOutputResourceUri());
      if (!matches.matches()) {
        throw new IllegalStateException(
            "Unexpected driver output URI: " + response.getDriverOutputResourceUri());
      }

      Storage storage = StorageOptions.getDefaultInstance().getService();
      Blob blob = storage.get(matches.group(1), String.format("%s.000000000", matches.group(2)));

      System.out.println(
          String.format("Job finished successfully: %s", new String(blob.getContent())));

    } catch (ExecutionException e) {
      // If the job does not complete successfully, print the error message.
      System.err.println(String.format("submitHadoopFsJob: %s ", e.getMessage()));
    }
  }
}
// [END dataproc_submit_hadoop_fs_job]
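
Likewise, a minimal sketch of invoking the HadoopFS sample against an existing cluster; the wrapper class name and argument values are hypothetical placeholders, with "-ls /" as one example FsShell query:

public class SubmitHadoopFsJobDemo {

  public static void main(String[] args) throws Exception {
    // Hypothetical placeholder values; "-ls /" lists the root of the cluster's
    // default filesystem via org.apache.hadoop.fs.FsShell.
    SubmitHadoopFsJob.submitHadoopFsJob("my-project-id", "us-central1", "my-cluster", "-ls /");
  }
}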
105 changes: 105 additions & 0 deletions dataproc/snippets/src/test/java/CreateClusterWithAutoscalingTest.java
@@ -0,0 +1,105 @@
/*
* Copyright 2020 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import static junit.framework.TestCase.assertNotNull;
import static org.hamcrest.MatcherAssert.assertThat;

import com.google.api.gax.longrunning.OperationFuture;
import com.google.cloud.dataproc.v1.AutoscalingPolicyName;
import com.google.cloud.dataproc.v1.AutoscalingPolicyServiceClient;
import com.google.cloud.dataproc.v1.AutoscalingPolicyServiceSettings;
import com.google.cloud.dataproc.v1.ClusterControllerClient;
import com.google.cloud.dataproc.v1.ClusterControllerSettings;
import com.google.cloud.dataproc.v1.ClusterOperationMetadata;
import com.google.protobuf.Empty;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import org.hamcrest.CoreMatchers;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

@RunWith(JUnit4.class)
public class CreateClusterWithAutoscalingTest {

  private static final String CLUSTER_NAME =
      String.format("java-as-test-%s", UUID.randomUUID().toString());
  private static final String REGION = "us-central1";
  private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");
  private static final String AUTOSCALING_POLICY_NAME =
      String.format("java-as-test-%s", UUID.randomUUID().toString());

  private ByteArrayOutputStream bout;

  private static void requireEnv(String varName) {
    assertNotNull(
        String.format("Environment variable '%s' is required to perform these tests.", varName),
        System.getenv(varName));
  }

  @BeforeClass
  public static void checkRequirements() {
    requireEnv("GOOGLE_APPLICATION_CREDENTIALS");
    requireEnv("GOOGLE_CLOUD_PROJECT");
  }

  @Before
  public void setUp() {
    bout = new ByteArrayOutputStream();
    System.setOut(new PrintStream(bout));
  }

  @After
  public void tearDown() throws IOException, InterruptedException, ExecutionException {
    String myEndpoint = String.format("%s-dataproc.googleapis.com:443", REGION);

    ClusterControllerSettings clusterControllerSettings =
        ClusterControllerSettings.newBuilder().setEndpoint(myEndpoint).build();

    AutoscalingPolicyServiceSettings autoscalingPolicyServiceSettings =
        AutoscalingPolicyServiceSettings.newBuilder().setEndpoint(myEndpoint).build();

    try (ClusterControllerClient clusterControllerClient =
            ClusterControllerClient.create(clusterControllerSettings);
        AutoscalingPolicyServiceClient autoscalingPolicyServiceClient =
            AutoscalingPolicyServiceClient.create(autoscalingPolicyServiceSettings)) {

      // Delete the cluster before the policy, since a policy still in use by a
      // cluster cannot be deleted.
      OperationFuture<Empty, ClusterOperationMetadata> deleteClusterAsyncRequest =
          clusterControllerClient.deleteClusterAsync(PROJECT_ID, REGION, CLUSTER_NAME);
      deleteClusterAsyncRequest.get();

      AutoscalingPolicyName name =
          AutoscalingPolicyName.ofProjectLocationAutoscalingPolicyName(
              PROJECT_ID, REGION, AUTOSCALING_POLICY_NAME);
      autoscalingPolicyServiceClient.deleteAutoscalingPolicy(name);
    }
  }

  @Test
  public void createClusterWithAutoscalingTest() throws IOException, InterruptedException {
    CreateClusterWithAutoscaling.createClusterWithAutoscaling(
        PROJECT_ID, REGION, CLUSTER_NAME, AUTOSCALING_POLICY_NAME);
    String output = bout.toString();

    assertThat(output, CoreMatchers.containsString(CLUSTER_NAME));
  }
}
1 change: 0 additions & 1 deletion dataproc/snippets/src/test/java/InstantiateInlineWorkflowTemplateTest.java
@@ -30,7 +30,6 @@
@RunWith(JUnit4.class)
public class InstantiateInlineWorkflowTemplateTest {


  private static final String REGION = "us-central1";
  private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");
