diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml index 0e9d7899502..cecbe488961 100644 --- a/.github/workflows/mvn-verify-check.yml +++ b/.github/workflows/mvn-verify-check.yml @@ -27,7 +27,7 @@ env: COMMON_MVN_FLAGS: >- -Ddist.jar.compress=false -DskipTests - -Dskip + -Dmaven.scaladoc.skip jobs: get-shim-versions-from-dist: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index eea7fe81387..1cc52e5472a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -218,6 +218,12 @@ for a single Spark version Shim alone. To this end in a pre-production build you can set the Boolean property `dist.jar.compress` to `false`, its default value is `true`. +Furthermore, after the first build on a clean repository the spark-rapids-jni +SNAPSHOT dependency typically does not change until the next nightly CI build, or until the next +install to the local Maven repo if you are working on a change to the native code. You can therefore +save significant time otherwise spent repeatedly unpacking these dependencies by adding +`-Drapids.jni.unpack.skip` to the `dist` build command. + The time saved is more significant if you are merely changing the `aggregator` module, or the `dist` module, or just incorporating changes from [spark-rapids-jni](https://github.com/NVIDIA/spark-rapids-jni/blob/branch-23.04/CONTRIBUTING.md#local-testing-of-cross-repo-contributions-cudf-spark-rapids-jni-and-spark-rapids) @@ -225,12 +231,12 @@ the `aggregator` module, or the `dist` module, or just incorporating changes fro For example, to quickly repackage `rapids-4-spark` after the initial `./build/buildall` you can iterate by invoking ```Bash -mvn package -pl dist -PnoSnapshots -Ddist.jar.compress=false +mvn package -pl dist -PnoSnapshots -Ddist.jar.compress=false -Drapids.jni.unpack.skip ``` or similarly ```Bash - ./build/buildall --rebuild-dist-only --option="-Ddist.jar.compress=false" + ./build/buildall --rebuild-dist-only --option="-Ddist.jar.compress=false -Drapids.jni.unpack.skip" ``` ## Code contributions @@ -282,7 +288,7 @@ Before proceeding with importing spark-rapids into IDEA or switching to a differ profile, execute the install phase with the corresponding `buildver`, e.g.
for Spark 3.4.0: ```bash - mvn clean install -Dbuildver=340 -Dskip -DskipTests + mvn clean install -Dbuildver=340 -Dmaven.scaladoc.skip -DskipTests ``` ##### Importing the project diff --git a/aggregator/pom.xml b/aggregator/pom.xml index f2fc06a370f..8f8b6da47fc 100644 --- a/aggregator/pom.xml +++ b/aggregator/pom.xml @@ -39,6 +39,10 @@ com.nvidia.shaded.spark false none + **/* + initialize + + initialize @@ -73,7 +77,6 @@ maven-shade-plugin true - ${spark.version.classifier} org.slf4j:* @@ -108,13 +111,78 @@ main-${spark.version.classifier} - package + compile shade + + org.apache.maven.plugins + maven-antrun-plugin + + + init-dirs + initialize + run + + + + + + + + generate-build-info + none + + + create-aggregator-for-downstream-if-content-changed + run + process-classes + + + + + + + + Checking if need to recreate: ${aggJarForDownstream} + + + + + + + + + + + + + + + + + + + Aggregator jar unchanged + + + Aggregator jar changed, recreating final jar + + + + + + + + + + org.jacoco jacoco-maven-plugin diff --git a/build/buildall b/build/buildall index a700acad539..356efa2d46d 100755 --- a/build/buildall +++ b/build/buildall @@ -262,7 +262,7 @@ function build_single_shim() { -DskipTests \ -Dbuildver="$BUILD_VER" \ -Drat.skip="$SKIP_CHECKS" \ - -Dskip \ + -Dmaven.scaladoc.skip \ -Dmaven.scalastyle.skip="$SKIP_CHECKS" \ -pl aggregator -am > "$LOG_FILE" 2>&1 || { [[ "$LOG_FILE" != "/dev/tty" ]] && echo "$LOG_FILE:" && tail -20 "$LOG_FILE" || true @@ -303,5 +303,5 @@ time ( echo "Resuming from $joinShimBuildFrom build only using $BASE_VER" $MVN $FINAL_OP -rf $joinShimBuildFrom $MODULE_OPT $MVN_PROFILE_OPT $INCLUDED_BUILDVERS_OPT \ -Dbuildver="$BASE_VER" \ - -DskipTests -Dskip + -DskipTests -Dmaven.scaladoc.skip ) diff --git a/dist/pom.xml b/dist/pom.xml index dd46404e33d..a858d2865b5 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -45,6 +45,7 @@ ${project.build.directory}/${project.build.finalName}-${jni.classifier}.jar jar:file:${dist.jar.name}!/META-INF/maven/${project.groupId}/${project.artifactId}/pom.xml none + false @@ -323,6 +324,19 @@ + + copy-jni-and-ucx-classes + + process-resources + run + + + + + + + + verify @@ -447,6 +461,7 @@ self.log("... OK") unpack + ${rapids.jni.unpack.skip} @@ -454,14 +469,14 @@ self.log("... OK") spark-rapids-jni ${jni.classifier} META-INF/** - ${project.build.directory}/parallel-world + ${project.build.directory}/jni-deps true org.openucx jucx META-INF/** - ${project.build.directory}/parallel-world + ${project.build.directory}/jni-deps true diff --git a/dist/scripts/binary-dedupe.sh b/dist/scripts/binary-dedupe.sh index b28b1cfa69d..183e86b1524 100755 --- a/dist/scripts/binary-dedupe.sh +++ b/dist/scripts/binary-dedupe.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -157,7 +157,7 @@ mv "$SPARK3XX_COMMON_DIR" parallel-world/ # Determine the list of unshimmed class files UNSHIMMED_LIST_TXT=unshimmed-result.txt echo "$((++STEP))/ creating sorted list of unshimmed classes > $UNSHIMMED_LIST_TXT" -find . 
-name '*.class' -not -path './parallel-world/spark3*' | \ +find ./parallel-world -name '*.class' -not -path './parallel-world/spark3*' | \ cut -d/ -f 3- | sort > "$UNSHIMMED_LIST_TXT" function verify_same_sha_for_unshimmed() { diff --git a/integration_tests/src/assembly/bin.xml b/integration_tests/src/assembly/bin.xml index c992b073eae..6209d0b152a 100644 --- a/integration_tests/src/assembly/bin.xml +++ b/integration_tests/src/assembly/bin.xml @@ -47,7 +47,7 @@ integration_tests - ${project.build.directory}/extra-resources/rapids4spark-version-info.properties + ${project.build.outputDirectory}/rapids4spark-version-info.properties integration_tests diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh index 2f8e7686cab..8a0b25a0c95 100755 --- a/jenkins/databricks/build.sh +++ b/jenkins/databricks/build.sh @@ -150,7 +150,7 @@ $MVN_CMD -B -Ddatabricks -Dbuildver=$BUILDVER clean package -DskipTests $MVN_OPT if [[ "$WITH_DEFAULT_UPSTREAM_SHIM" != "0" ]]; then echo "Building the default Spark shim and creating a two-shim dist jar" UPSTREAM_BUILDVER=$($MVN_CMD help:evaluate -q -pl dist -Dexpression=buildver -DforceStdout) - $MVN_CMD -B package -pl dist -am -DskipTests -Dskip $MVN_OPT \ + $MVN_CMD -B package -pl dist -am -DskipTests -Dmaven.scaladoc.skip $MVN_OPT \ -Dincluded_buildvers=$UPSTREAM_BUILDVER,$BUILDVER fi diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index 5f3b33a108d..a13b5137af0 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -29,7 +29,7 @@ fi CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-'cuda11'} MVN_CMD="mvn -Dmaven.wagon.http.retryHandler.count=3" -MVN_BUILD_ARGS="-Drat.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER" +MVN_BUILD_ARGS="-Drat.skip=true -Dmaven.scaladoc.skip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER" mvn_verify() { echo "Run mvn verify..." diff --git a/pom.xml b/pom.xml index afb519ffc03..297492604de 100644 --- a/pom.xml +++ b/pom.xml @@ -819,6 +819,7 @@ false install ${spark.rapids.source.basedir}/.bloop + ${project.build.outputDirectory}/rapids4spark-version-info.properties @@ -966,30 +967,64 @@ + + setup-dirs + initialize + run + + + + + + + generate-build-info generate-resources - - - - - - + + + - - - - - - - + + +Comparing git revisions: + previous=${saved.build-info.revision} + current=${git.head.revision} + + + + + + +Git revisions unchanged: skipping version info file generation. +Delete ${build.info.path} or mvn clean if regeneration desired. +This will force full Scala code rebuild in downstream modules. 
+ + + + Generating new version info file + + + + + + + + + + + + + + + @@ -1049,6 +1084,7 @@ org.apache.maven.plugins maven-compiler-plugin + 3.11.0 default-compile @@ -1118,8 +1154,8 @@ -Xfatal-warnings -Wconf:cat=lint-adapted-args:e - -Xsource:2.13 + initialize @@ -73,7 +77,6 @@ maven-shade-plugin true - ${spark.version.classifier} org.slf4j:* @@ -108,13 +111,78 @@ main-${spark.version.classifier} - package + compile shade + + org.apache.maven.plugins + maven-antrun-plugin + + + init-dirs + initialize + run + + + + + + + + generate-build-info + none + + + create-aggregator-for-downstream-if-content-changed + run + process-classes + + + + + + + + Checking if need to recreate: ${aggJarForDownstream} + + + + + + + + + + + + + + + + + + + Aggregator jar unchanged + + + Aggregator jar changed, recreating final jar + + + + + + + + + + org.jacoco jacoco-maven-plugin diff --git a/scala2.13/dist/pom.xml b/scala2.13/dist/pom.xml index a065880fcfb..7e87dfe5f7c 100644 --- a/scala2.13/dist/pom.xml +++ b/scala2.13/dist/pom.xml @@ -45,6 +45,7 @@ ${project.build.directory}/${project.build.finalName}-${jni.classifier}.jar jar:file:${dist.jar.name}!/META-INF/maven/${project.groupId}/${project.artifactId}/pom.xml none + false @@ -323,6 +324,19 @@ + + copy-jni-and-ucx-classes + + process-resources + run + + + + + + + + verify @@ -447,6 +461,7 @@ self.log("... OK") unpack + ${rapids.jni.unpack.skip} @@ -454,14 +469,14 @@ self.log("... OK") spark-rapids-jni ${jni.classifier} META-INF/** - ${project.build.directory}/parallel-world + ${project.build.directory}/jni-deps true org.openucx jucx META-INF/** - ${project.build.directory}/parallel-world + ${project.build.directory}/jni-deps true diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index 629692d6e65..fbc33b06cb5 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -819,6 +819,7 @@ false install ${spark.rapids.source.basedir}/.bloop + ${project.build.outputDirectory}/rapids4spark-version-info.properties @@ -966,30 +967,64 @@ + + setup-dirs + initialize + run + + + + + + + generate-build-info generate-resources - - - - - - + + + - - - - - - - + + +Comparing git revisions: + previous=${saved.build-info.revision} + current=${git.head.revision} + + + + + + +Git revisions unchanged: skipping version info file generation. +Delete ${build.info.path} or mvn clean if regeneration desired. +This will force full Scala code rebuild in downstream modules. 
+ + + + Generating new version info file + + + + + + + + + + + + + + + @@ -1049,6 +1084,7 @@ org.apache.maven.plugins maven-compiler-plugin + 3.11.0 default-compile @@ -1118,8 +1154,8 @@ -Xfatal-warnings --> -Wconf:cat=lint-adapted-args:e - -Xsource:2.13 + -Xsource:2.13 -Ywarn-unused:locals,patvars,privates -Wconf:cat=deprecation:wv,any:e -Wconf:cat=scaladoc:wv diff --git a/scala2.13/sql-plugin/pom.xml b/scala2.13/sql-plugin/pom.xml index 02090fb5e7e..67f3f91c30f 100644 --- a/scala2.13/sql-plugin/pom.xml +++ b/scala2.13/sql-plugin/pom.xml @@ -154,7 +154,7 @@ @@ -171,13 +171,27 @@ run - - - - - + + + + + + + + + + Skipping shim service file generation, already exists + + + Recreating shim service file + + + + + diff --git a/sql-plugin/pom.xml b/sql-plugin/pom.xml index 9d752b57f8d..9773cc91ba1 100644 --- a/sql-plugin/pom.xml +++ b/sql-plugin/pom.xml @@ -154,7 +154,7 @@ @@ -171,13 +171,27 @@ run - - - - - + + + + + + + + + + Skipping shim service file generation, already exists + + + Recreating shim service file + + + + + diff --git a/tests/README.md b/tests/README.md index 483c1309ec1..b53291b6839 100644 --- a/tests/README.md +++ b/tests/README.md @@ -7,46 +7,81 @@ and the code is in the `com.nvidia.spark.rapids.tests.mortgage` package. ## Unit Tests -Unit tests exist in the [tests]() directory. This is unconventional and is done, so we can run the -tests on the final shaded version of the plugin. It also helps with how we collect code coverage. +Unit tests implemented using the ScalaTest framework reside in the [tests]() directory. This is +unconventional and is done so we can run the tests on the close-to-final shaded single-shim version +of the plugin. It also helps with how we collect code coverage. -The `tests` module depends on the `aggregator` module which shades dependencies. When running the -tests via `mvn test`, make sure to run install command via `mvn install` for the aggregator jar to the -local maven repository. -The steps to run the unit tests: -```bash -cd -mvn clean install -cd tests -mvn test -``` +The `tests` module depends on the `aggregator` module, which shades external dependencies and +aggregates them along with internal submodules into an artifact supporting a single Spark version. + +The minimum required Maven phase to run unit tests is `package`. Alternatively, you may run +`mvn install` and use `mvn test` for subsequent testing. However, to avoid dealing with stale jars +in the local Maven repo cache, we recommend invoking `mvn package -pl tests -am ...` from the +`spark-rapids` root directory. Add `-f scala2.13` if you want to run unit tests against +Apache Spark dependencies based on Scala 2.13. + +To run targeted Scala tests use + +`-DwildcardSuites=<comma-separated list of wildcard suite names to execute>` + +Or, more conveniently, use a combination of + +`-Dsuffixes=<regex matching suite name suffixes>` to restrict the test suites being run, +which corresponds to the `-q` option in the +[ScalaTest runner](https://www.scalatest.org/user_guide/using_the_runner) + +and + +`-Dtests=<substring or exact name of the tests to run>` to restrict the tests run within the selected suites, +which corresponds to the `-z` or `-t` options in the +[ScalaTest runner](https://www.scalatest.org/user_guide/using_the_runner). -To run targeted Scala tests append `-DwildcardSuites=` to the above command. - For more information about using scalatest with Maven please refer to the -[scalatest documentation](https://www.scalatest.org/user_guide/using_the_scalatest_maven_plugin).
- +[scalatest documentation](https://www.scalatest.org/user_guide/using_the_scalatest_maven_plugin) +and the +[source code](https://github.com/scalatest/scalatest-maven-plugin/blob/383f396162b7654930758b76a0696d3aa2ce5686/src/main/java/org/scalatest/tools/maven/AbstractScalaTestMojo.java#L34). + +  #### Running Unit Tests Against Specific Apache Spark Versions You can run the unit tests against different versions of Spark using the different profiles. The -default version runs against Spark 3.1.1, to run against a specific version use one of the following -profiles: - - `-Prelease311` (Spark 3.1.1) - - `-Prelease321` (Spark 3.2.1) - - `-Prelease322` (Spark 3.2.2) - - `-Prelease330` (Spark 3.3.0) - - `-Prelease340` (Spark 3.4.0) +default version runs against Spark 3.1.1; to run against a specific version use a `buildver` property: + +- `-Dbuildver=311` (Spark 3.1.1) +- `-Dbuildver=350` (Spark 3.5.0) + +etc. Please refer to the [tests project POM](pom.xml) to see the list of test profiles supported. Apache Spark specific configurations can be passed in by setting the `SPARK_CONF` environment variable. -Examples: -- To run tests against Apache Spark 3.2.1, - `mvn -Prelease321 test` -- To pass Apache Spark configs `--conf spark.dynamicAllocation.enabled=false --conf spark.task.cpus=1` do something like. - `SPARK_CONF="spark.dynamicAllocation.enabled=false,spark.task.cpus=1" mvn ...` -- To run test ParquetWriterSuite in package com.nvidia.spark.rapids, issue `mvn test -DwildcardSuites="com.nvidia.spark.rapids.ParquetWriterSuite"` +Examples: + +To run all tests against Apache Spark 3.2.1: + +```bash +mvn package -pl tests -am -Dbuildver=321 +``` + +To pass Apache Spark configs `--conf spark.dynamicAllocation.enabled=false --conf spark.task.cpus=1` +do something like: + +```bash +SPARK_CONF="spark.dynamicAllocation.enabled=false,spark.task.cpus=1" mvn ... +``` + +To run all tests in `ParquetWriterSuite` in package com.nvidia.spark.rapids, issue: + +```bash +mvn package -pl tests -am -DwildcardSuites="com.nvidia.spark.rapids.ParquetWriterSuite" +``` + +To run all `AnsiCastOpSuite` and `CastOpSuite` tests dealing with decimals using +Apache Spark 3.3.0 on Scala 2.13 artifacts, issue: + +```bash +mvn package -f scala2.13 -pl tests -am -Dbuildver=330 -Dsuffixes='.*CastOpSuite' -Dtests=decimal +``` ## Integration Tests
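As a combined illustration of the build and test options introduced above, an incremental developer loop after the initial `./build/buildall` might look roughly like the sketch below; it only reuses the flags already shown in this patch (`-Ddist.jar.compress=false`, `-Drapids.jni.unpack.skip`, `-Dmaven.scaladoc.skip`, `-Dbuildver`, `-Dsuffixes`, `-Dtests`, `SPARK_CONF`), and the suite/test filters are merely examples.

```bash
# Repackage the dist jar without compression, skipping scaladoc generation
# and the repeated unpacking of the spark-rapids-jni and jucx dependencies
mvn package -pl dist -PnoSnapshots \
  -Ddist.jar.compress=false -Drapids.jni.unpack.skip -Dmaven.scaladoc.skip

# Run a narrowed-down set of unit tests against a specific Spark version,
# optionally passing extra Spark configs via SPARK_CONF
SPARK_CONF="spark.dynamicAllocation.enabled=false,spark.task.cpus=1" \
  mvn package -pl tests -am -Dbuildver=330 -Dsuffixes='.*CastOpSuite' -Dtests=decimal
```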