diff --git a/website/blog/modules/ROOT/attachments/fast-assembly/.mill-version b/website/blog/modules/ROOT/attachments/fast-assembly/.mill-version new file mode 100644 index 00000000000..cc96715b285 --- /dev/null +++ b/website/blog/modules/ROOT/attachments/fast-assembly/.mill-version @@ -0,0 +1 @@ +0.12.7 \ No newline at end of file diff --git a/website/blog/modules/ROOT/attachments/fast-assembly/.mvn/wrapper/MavenWrapperDownloader.java b/website/blog/modules/ROOT/attachments/fast-assembly/.mvn/wrapper/MavenWrapperDownloader.java new file mode 100644 index 00000000000..e76d1f3241d --- /dev/null +++ b/website/blog/modules/ROOT/attachments/fast-assembly/.mvn/wrapper/MavenWrapperDownloader.java @@ -0,0 +1,117 @@ +/* + * Copyright 2007-present the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.net.*; +import java.io.*; +import java.nio.channels.*; +import java.util.Properties; + +public class MavenWrapperDownloader { + + private static final String WRAPPER_VERSION = "0.5.6"; + /** + * Default URL to download the maven-wrapper.jar from, if no 'downloadUrl' is provided. + */ + private static final String DEFAULT_DOWNLOAD_URL = "https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/" + + WRAPPER_VERSION + "/maven-wrapper-" + WRAPPER_VERSION + ".jar"; + + /** + * Path to the maven-wrapper.properties file, which might contain a downloadUrl property to + * use instead of the default one. + */ + private static final String MAVEN_WRAPPER_PROPERTIES_PATH = + ".mvn/wrapper/maven-wrapper.properties"; + + /** + * Path where the maven-wrapper.jar will be saved to. + */ + private static final String MAVEN_WRAPPER_JAR_PATH = + ".mvn/wrapper/maven-wrapper.jar"; + + /** + * Name of the property which should be used to override the default download url for the wrapper. + */ + private static final String PROPERTY_NAME_WRAPPER_URL = "wrapperUrl"; + + public static void main(String args[]) { + System.out.println("- Downloader started"); + File baseDirectory = new File(args[0]); + System.out.println("- Using base directory: " + baseDirectory.getAbsolutePath()); + + // If the maven-wrapper.properties exists, read it and check if it contains a custom + // wrapperUrl parameter. 
+ File mavenWrapperPropertyFile = new File(baseDirectory, MAVEN_WRAPPER_PROPERTIES_PATH); + String url = DEFAULT_DOWNLOAD_URL; + if(mavenWrapperPropertyFile.exists()) { + FileInputStream mavenWrapperPropertyFileInputStream = null; + try { + mavenWrapperPropertyFileInputStream = new FileInputStream(mavenWrapperPropertyFile); + Properties mavenWrapperProperties = new Properties(); + mavenWrapperProperties.load(mavenWrapperPropertyFileInputStream); + url = mavenWrapperProperties.getProperty(PROPERTY_NAME_WRAPPER_URL, url); + } catch (IOException e) { + System.out.println("- ERROR loading '" + MAVEN_WRAPPER_PROPERTIES_PATH + "'"); + } finally { + try { + if(mavenWrapperPropertyFileInputStream != null) { + mavenWrapperPropertyFileInputStream.close(); + } + } catch (IOException e) { + // Ignore ... + } + } + } + System.out.println("- Downloading from: " + url); + + File outputFile = new File(baseDirectory.getAbsolutePath(), MAVEN_WRAPPER_JAR_PATH); + if(!outputFile.getParentFile().exists()) { + if(!outputFile.getParentFile().mkdirs()) { + System.out.println( + "- ERROR creating output directory '" + outputFile.getParentFile().getAbsolutePath() + "'"); + } + } + System.out.println("- Downloading to: " + outputFile.getAbsolutePath()); + try { + downloadFileFromURL(url, outputFile); + System.out.println("Done"); + System.exit(0); + } catch (Throwable e) { + System.out.println("- Error downloading"); + e.printStackTrace(); + System.exit(1); + } + } + + private static void downloadFileFromURL(String urlString, File destination) throws Exception { + if (System.getenv("MVNW_USERNAME") != null && System.getenv("MVNW_PASSWORD") != null) { + String username = System.getenv("MVNW_USERNAME"); + char[] password = System.getenv("MVNW_PASSWORD").toCharArray(); + Authenticator.setDefault(new Authenticator() { + @Override + protected PasswordAuthentication getPasswordAuthentication() { + return new PasswordAuthentication(username, password); + } + }); + } + URL website = new URL(urlString); + ReadableByteChannel rbc; + rbc = Channels.newChannel(website.openStream()); + FileOutputStream fos = new FileOutputStream(destination); + fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE); + fos.close(); + rbc.close(); + } + +} diff --git a/website/blog/modules/ROOT/attachments/fast-assembly/.mvn/wrapper/maven-wrapper.jar b/website/blog/modules/ROOT/attachments/fast-assembly/.mvn/wrapper/maven-wrapper.jar new file mode 100644 index 00000000000..2cc7d4a55c0 Binary files /dev/null and b/website/blog/modules/ROOT/attachments/fast-assembly/.mvn/wrapper/maven-wrapper.jar differ diff --git a/website/blog/modules/ROOT/attachments/fast-assembly/.mvn/wrapper/maven-wrapper.properties b/website/blog/modules/ROOT/attachments/fast-assembly/.mvn/wrapper/maven-wrapper.properties new file mode 100644 index 00000000000..abd303b6738 --- /dev/null +++ b/website/blog/modules/ROOT/attachments/fast-assembly/.mvn/wrapper/maven-wrapper.properties @@ -0,0 +1,2 @@ +distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.2/apache-maven-3.8.2-bin.zip +wrapperUrl=https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar diff --git a/website/blog/modules/ROOT/attachments/fast-assembly/build.mill b/website/blog/modules/ROOT/attachments/fast-assembly/build.mill new file mode 100644 index 00000000000..3aaee780f67 --- /dev/null +++ b/website/blog/modules/ROOT/attachments/fast-assembly/build.mill @@ -0,0 +1,12 @@ +package build +import mill._, scalalib._ + +object `package` extends 
RootModule with SbtModule { + def scalaVersion = "2.12.19" + def ivyDeps = Agg( + ivy"org.apache.spark::spark-core:3.5.4", + ivy"org.apache.spark::spark-sql:3.5.4" + ) + + def prependShellScript = "" +} \ No newline at end of file diff --git a/website/blog/modules/ROOT/attachments/fast-assembly/build.sbt b/website/blog/modules/ROOT/attachments/fast-assembly/build.sbt new file mode 100644 index 00000000000..1f72b17ce69 --- /dev/null +++ b/website/blog/modules/ROOT/attachments/fast-assembly/build.sbt @@ -0,0 +1,16 @@ +lazy val root = (project in file(".")) + .enablePlugins(AssemblyPlugin) // Enables sbt-assembly + .settings( + name := "spark-app", + version := "0.1", + scalaVersion := "2.12.19", + libraryDependencies ++= Seq( + "org.apache.spark" %% "spark-core" % "3.5.4", + "org.apache.spark" %% "spark-sql" % "3.5.4", + ), + assembly / assemblyMergeStrategy := { + case PathList("META-INF", "services", _*) => MergeStrategy.concat + case PathList("META-INF", xs @ _*) => MergeStrategy.discard + case x => MergeStrategy.first + } + ) \ No newline at end of file diff --git a/website/blog/modules/ROOT/attachments/fast-assembly/mill b/website/blog/modules/ROOT/attachments/fast-assembly/mill new file mode 100755 index 00000000000..5102f00fc92 --- /dev/null +++ b/website/blog/modules/ROOT/attachments/fast-assembly/mill @@ -0,0 +1,265 @@ +#!/usr/bin/env sh + +# This is a wrapper script, that automatically download mill from GitHub release pages +# You can give the required mill version with --mill-version parameter +# If no version is given, it falls back to the value of DEFAULT_MILL_VERSION +# +# Original Project page: https://github.com/lefou/millw +# Script Version: 0.4.12 +# +# If you want to improve this script, please also contribute your changes back! +# +# Licensed under the Apache License, Version 2.0 + +set -e + +if [ -z "${DEFAULT_MILL_VERSION}" ] ; then + DEFAULT_MILL_VERSION="0.11.4" +fi + + +if [ -z "${GITHUB_RELEASE_CDN}" ] ; then + GITHUB_RELEASE_CDN="" +fi + + +MILL_REPO_URL="https://github.com/com-lihaoyi/mill" + +if [ -z "${CURL_CMD}" ] ; then + CURL_CMD=curl +fi + +# Explicit commandline argument takes precedence over all other methods +if [ "$1" = "--mill-version" ] ; then + shift + if [ "x$1" != "x" ] ; then + MILL_VERSION="$1" + shift + else + echo "You specified --mill-version without a version." 1>&2 + echo "Please provide a version that matches one provided on" 1>&2 + echo "${MILL_REPO_URL}/releases" 1>&2 + false + fi +fi + +# Please note, that if a MILL_VERSION is already set in the environment, +# We reuse it's value and skip searching for a value. + +# If not already set, read .mill-version file +if [ -z "${MILL_VERSION}" ] ; then + if [ -f ".mill-version" ] ; then + MILL_VERSION="$(tr '\r' '\n' < .mill-version | head -n 1 2> /dev/null)" + elif [ -f ".config/mill-version" ] ; then + MILL_VERSION="$(tr '\r' '\n' < .config/mill-version | head -n 1 2> /dev/null)" + fi +fi + +MILL_USER_CACHE_DIR="${XDG_CACHE_HOME:-${HOME}/.cache}/mill" + +if [ -z "${MILL_DOWNLOAD_PATH}" ] ; then + MILL_DOWNLOAD_PATH="${MILL_USER_CACHE_DIR}/download" +fi + +# If not already set, try to fetch newest from Github +if [ -z "${MILL_VERSION}" ] ; then + # TODO: try to load latest version from release page + echo "No mill version specified." 1>&2 + echo "You should provide a version via '.mill-version' file or --mill-version option." 
1>&2 + + mkdir -p "${MILL_DOWNLOAD_PATH}" + LANG=C touch -d '1 hour ago' "${MILL_DOWNLOAD_PATH}/.expire_latest" 2>/dev/null || ( + # we might be on OSX or BSD which don't have -d option for touch + # but probably a -A [-][[hh]mm]SS + touch "${MILL_DOWNLOAD_PATH}/.expire_latest"; touch -A -010000 "${MILL_DOWNLOAD_PATH}/.expire_latest" + ) || ( + # in case we still failed, we retry the first touch command with the intention + # to show the (previously suppressed) error message + LANG=C touch -d '1 hour ago' "${MILL_DOWNLOAD_PATH}/.expire_latest" + ) + + # POSIX shell variant of bash's -nt operator, see https://unix.stackexchange.com/a/449744/6993 + # if [ "${MILL_DOWNLOAD_PATH}/.latest" -nt "${MILL_DOWNLOAD_PATH}/.expire_latest" ] ; then + if [ -n "$(find -L "${MILL_DOWNLOAD_PATH}/.latest" -prune -newer "${MILL_DOWNLOAD_PATH}/.expire_latest")" ]; then + # we know a current latest version + MILL_VERSION=$(head -n 1 "${MILL_DOWNLOAD_PATH}"/.latest 2> /dev/null) + fi + + if [ -z "${MILL_VERSION}" ] ; then + # we don't know a current latest version + echo "Retrieving latest mill version ..." 1>&2 + LANG=C ${CURL_CMD} -s -i -f -I ${MILL_REPO_URL}/releases/latest 2> /dev/null | grep --ignore-case Location: | sed s'/^.*tag\///' | tr -d '\r\n' > "${MILL_DOWNLOAD_PATH}/.latest" + MILL_VERSION=$(head -n 1 "${MILL_DOWNLOAD_PATH}"/.latest 2> /dev/null) + fi + + if [ -z "${MILL_VERSION}" ] ; then + # Last resort + MILL_VERSION="${DEFAULT_MILL_VERSION}" + echo "Falling back to hardcoded mill version ${MILL_VERSION}" 1>&2 + else + echo "Using mill version ${MILL_VERSION}" 1>&2 + fi +fi + +MILL_NATIVE_SUFFIX="-native" +FULL_MILL_VERSION=$MILL_VERSION +ARTIFACT_SUFFIX="" +case "$MILL_VERSION" in + *"$MILL_NATIVE_SUFFIX") + MILL_VERSION=${MILL_VERSION%"$MILL_NATIVE_SUFFIX"} + if [ "$(expr substr $(uname -s) 1 5 2>/dev/null)" = "Linux" ]; then + if [ "$(uname -m)" = "aarch64" ]; then + ARTIFACT_SUFFIX="-native-linux-aarch64" + else + ARTIFACT_SUFFIX="-native-linux-amd64" + fi + elif [ "$(uname)" = "Darwin" ]; then + if [ "$(uname -m)" = "arm64" ]; then + ARTIFACT_SUFFIX="-native-mac-aarch64" + else + ARTIFACT_SUFFIX="-native-mac-amd64" + fi + else + echo "This native mill launcher supports only Linux and macOS." 1>&2 + exit 1 + fi +esac + +MILL="${MILL_DOWNLOAD_PATH}/${FULL_MILL_VERSION}" + +try_to_use_system_mill() { + if [ "$(uname)" != "Linux" ]; then + return 0 + fi + + MILL_IN_PATH="$(command -v mill || true)" + + if [ -z "${MILL_IN_PATH}" ]; then + return 0 + fi + + SYSTEM_MILL_FIRST_TWO_BYTES=$(head --bytes=2 "${MILL_IN_PATH}") + if [ "${SYSTEM_MILL_FIRST_TWO_BYTES}" = "#!" ]; then + # MILL_IN_PATH is (very likely) a shell script and not the mill + # executable, ignore it. + return 0 + fi + + SYSTEM_MILL_PATH=$(readlink -e "${MILL_IN_PATH}") + SYSTEM_MILL_SIZE=$(stat --format=%s "${SYSTEM_MILL_PATH}") + SYSTEM_MILL_MTIME=$(stat --format=%y "${SYSTEM_MILL_PATH}") + + if [ ! -d "${MILL_USER_CACHE_DIR}" ]; then + mkdir -p "${MILL_USER_CACHE_DIR}" + fi + + SYSTEM_MILL_INFO_FILE="${MILL_USER_CACHE_DIR}/system-mill-info" + if [ -f "${SYSTEM_MILL_INFO_FILE}" ]; then + parseSystemMillInfo() { + LINE_NUMBER="${1}" + # Select the line number of the SYSTEM_MILL_INFO_FILE, cut the + # variable definition in that line in two halves and return + # the value, and finally remove the quotes. 
+ sed -n "${LINE_NUMBER}p" "${SYSTEM_MILL_INFO_FILE}" |\ + cut -d= -f2 |\ + sed 's/"\(.*\)"/\1/' + } + + CACHED_SYSTEM_MILL_PATH=$(parseSystemMillInfo 1) + CACHED_SYSTEM_MILL_VERSION=$(parseSystemMillInfo 2) + CACHED_SYSTEM_MILL_SIZE=$(parseSystemMillInfo 3) + CACHED_SYSTEM_MILL_MTIME=$(parseSystemMillInfo 4) + + if [ "${SYSTEM_MILL_PATH}" = "${CACHED_SYSTEM_MILL_PATH}" ] \ + && [ "${SYSTEM_MILL_SIZE}" = "${CACHED_SYSTEM_MILL_SIZE}" ] \ + && [ "${SYSTEM_MILL_MTIME}" = "${CACHED_SYSTEM_MILL_MTIME}" ]; then + if [ "${CACHED_SYSTEM_MILL_VERSION}" = "${MILL_VERSION}" ]; then + MILL="${SYSTEM_MILL_PATH}" + return 0 + else + return 0 + fi + fi + fi + + SYSTEM_MILL_VERSION=$(${SYSTEM_MILL_PATH} --version | head -n1 | sed -n 's/^Mill.*version \(.*\)/\1/p') + + cat < "${SYSTEM_MILL_INFO_FILE}" +CACHED_SYSTEM_MILL_PATH="${SYSTEM_MILL_PATH}" +CACHED_SYSTEM_MILL_VERSION="${SYSTEM_MILL_VERSION}" +CACHED_SYSTEM_MILL_SIZE="${SYSTEM_MILL_SIZE}" +CACHED_SYSTEM_MILL_MTIME="${SYSTEM_MILL_MTIME}" +EOF + + if [ "${SYSTEM_MILL_VERSION}" = "${MILL_VERSION}" ]; then + MILL="${SYSTEM_MILL_PATH}" + fi +} +try_to_use_system_mill + +# If not already downloaded, download it +if [ ! -s "${MILL}" ] ; then + + # support old non-XDG download dir + MILL_OLD_DOWNLOAD_PATH="${HOME}/.mill/download" + OLD_MILL="${MILL_OLD_DOWNLOAD_PATH}/${MILL_VERSION}" + if [ -x "${OLD_MILL}" ] ; then + MILL="${OLD_MILL}" + else + case $MILL_VERSION in + 0.0.* | 0.1.* | 0.2.* | 0.3.* | 0.4.* ) + DOWNLOAD_SUFFIX="" + DOWNLOAD_FROM_MAVEN=0 + ;; + 0.5.* | 0.6.* | 0.7.* | 0.8.* | 0.9.* | 0.10.* | 0.11.0-M* ) + DOWNLOAD_SUFFIX="-assembly" + DOWNLOAD_FROM_MAVEN=0 + ;; + *) + DOWNLOAD_SUFFIX="-assembly" + DOWNLOAD_FROM_MAVEN=1 + ;; + esac + + DOWNLOAD_FILE=$(mktemp mill.XXXXXX) + + if [ "$DOWNLOAD_FROM_MAVEN" = "1" ] ; then + DOWNLOAD_URL="https://repo1.maven.org/maven2/com/lihaoyi/mill-dist${ARTIFACT_SUFFIX}/${MILL_VERSION}/mill-dist${ARTIFACT_SUFFIX}-${MILL_VERSION}.jar" + else + MILL_VERSION_TAG=$(echo "$MILL_VERSION" | sed -E 's/([^-]+)(-M[0-9]+)?(-.*)?/\1\2/') + DOWNLOAD_URL="${GITHUB_RELEASE_CDN}${MILL_REPO_URL}/releases/download/${MILL_VERSION_TAG}/${MILL_VERSION}${DOWNLOAD_SUFFIX}" + unset MILL_VERSION_TAG + fi + + # TODO: handle command not found + echo "Downloading mill ${MILL_VERSION} from ${DOWNLOAD_URL} ..." 
1>&2 + ${CURL_CMD} -f -L -o "${DOWNLOAD_FILE}" "${DOWNLOAD_URL}" + chmod +x "${DOWNLOAD_FILE}" + mkdir -p "${MILL_DOWNLOAD_PATH}" + mv "${DOWNLOAD_FILE}" "${MILL}" + + unset DOWNLOAD_FILE + unset DOWNLOAD_SUFFIX + fi +fi + +if [ -z "$MILL_MAIN_CLI" ] ; then + MILL_MAIN_CLI="${0}" +fi + +MILL_FIRST_ARG="" +if [ "$1" = "--bsp" ] || [ "$1" = "-i" ] || [ "$1" = "--interactive" ] || [ "$1" = "--no-server" ] || [ "$1" = "--repl" ] || [ "$1" = "--help" ] ; then + # Need to preserve the first position of those listed options + MILL_FIRST_ARG=$1 + shift +fi + +unset MILL_DOWNLOAD_PATH +unset MILL_OLD_DOWNLOAD_PATH +unset OLD_MILL +unset MILL_VERSION +unset MILL_REPO_URL + +# We don't quote MILL_FIRST_ARG on purpose, so we can expand the empty value without quotes +# shellcheck disable=SC2086 +exec "${MILL}" $MILL_FIRST_ARG -D "mill.main.cli=${MILL_MAIN_CLI}" "$@" \ No newline at end of file diff --git a/website/blog/modules/ROOT/attachments/fast-assembly/mvnw b/website/blog/modules/ROOT/attachments/fast-assembly/mvnw new file mode 100755 index 00000000000..a16b5431b4c --- /dev/null +++ b/website/blog/modules/ROOT/attachments/fast-assembly/mvnw @@ -0,0 +1,310 @@ +#!/bin/sh +# ---------------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Maven Start Up Batch script +# +# Required ENV vars: +# ------------------ +# JAVA_HOME - location of a JDK home dir +# +# Optional ENV vars +# ----------------- +# M2_HOME - location of maven2's installed home dir +# MAVEN_OPTS - parameters passed to the Java VM when running Maven +# e.g. to debug Maven itself, use +# set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 +# MAVEN_SKIP_RC - flag to disable loading of mavenrc files +# ---------------------------------------------------------------------------- + +if [ -z "$MAVEN_SKIP_RC" ] ; then + + if [ -f /etc/mavenrc ] ; then + . /etc/mavenrc + fi + + if [ -f "$HOME/.mavenrc" ] ; then + . "$HOME/.mavenrc" + fi + +fi + +# OS specific support. $var _must_ be set to either true or false. 
+cygwin=false; +darwin=false; +mingw=false +case "`uname`" in + CYGWIN*) cygwin=true ;; + MINGW*) mingw=true;; + Darwin*) darwin=true + # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home + # See https://developer.apple.com/library/mac/qa/qa1170/_index.html + if [ -z "$JAVA_HOME" ]; then + if [ -x "/usr/libexec/java_home" ]; then + export JAVA_HOME="`/usr/libexec/java_home`" + else + export JAVA_HOME="/Library/Java/Home" + fi + fi + ;; +esac + +if [ -z "$JAVA_HOME" ] ; then + if [ -r /etc/gentoo-release ] ; then + JAVA_HOME=`java-config --jre-home` + fi +fi + +if [ -z "$M2_HOME" ] ; then + ## resolve links - $0 may be a link to maven's home + PRG="$0" + + # need this for relative symlinks + while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG="`dirname "$PRG"`/$link" + fi + done + + saveddir=`pwd` + + M2_HOME=`dirname "$PRG"`/.. + + # make it fully qualified + M2_HOME=`cd "$M2_HOME" && pwd` + + cd "$saveddir" + # echo Using m2 at $M2_HOME +fi + +# For Cygwin, ensure paths are in UNIX format before anything is touched +if $cygwin ; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --unix "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --unix "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --unix "$CLASSPATH"` +fi + +# For Mingw, ensure paths are in UNIX format before anything is touched +if $mingw ; then + [ -n "$M2_HOME" ] && + M2_HOME="`(cd "$M2_HOME"; pwd)`" + [ -n "$JAVA_HOME" ] && + JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`" +fi + +if [ -z "$JAVA_HOME" ]; then + javaExecutable="`which javac`" + if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then + # readlink(1) is not available as standard on Solaris 10. + readLink=`which readlink` + if [ ! `expr "$readLink" : '\([^ ]*\)'` = "no" ]; then + if $darwin ; then + javaHome="`dirname \"$javaExecutable\"`" + javaExecutable="`cd \"$javaHome\" && pwd -P`/javac" + else + javaExecutable="`readlink -f \"$javaExecutable\"`" + fi + javaHome="`dirname \"$javaExecutable\"`" + javaHome=`expr "$javaHome" : '\(.*\)/bin'` + JAVA_HOME="$javaHome" + export JAVA_HOME + fi + fi +fi + +if [ -z "$JAVACMD" ] ; then + if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + else + JAVACMD="`which java`" + fi +fi + +if [ ! -x "$JAVACMD" ] ; then + echo "Error: JAVA_HOME is not defined correctly." >&2 + echo " We cannot execute $JAVACMD" >&2 + exit 1 +fi + +if [ -z "$JAVA_HOME" ] ; then + echo "Warning: JAVA_HOME environment variable is not set." 
+fi + +CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher + +# traverses directory structure from process work directory to filesystem root +# first directory with .mvn subdirectory is considered project base directory +find_maven_basedir() { + + if [ -z "$1" ] + then + echo "Path not specified to find_maven_basedir" + return 1 + fi + + basedir="$1" + wdir="$1" + while [ "$wdir" != '/' ] ; do + if [ -d "$wdir"/.mvn ] ; then + basedir=$wdir + break + fi + # workaround for JBEAP-8937 (on Solaris 10/Sparc) + if [ -d "${wdir}" ]; then + wdir=`cd "$wdir/.."; pwd` + fi + # end of workaround + done + echo "${basedir}" +} + +# concatenates all lines of a file +concat_lines() { + if [ -f "$1" ]; then + echo "$(tr -s '\n' ' ' < "$1")" + fi +} + +BASE_DIR=`find_maven_basedir "$(pwd)"` +if [ -z "$BASE_DIR" ]; then + exit 1; +fi + +########################################################################################## +# Extension to allow automatically downloading the maven-wrapper.jar from Maven-central +# This allows using the maven wrapper in projects that prohibit checking in binary data. +########################################################################################## +if [ -r "$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" ]; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found .mvn/wrapper/maven-wrapper.jar" + fi +else + if [ "$MVNW_VERBOSE" = true ]; then + echo "Couldn't find .mvn/wrapper/maven-wrapper.jar, downloading it ..." + fi + if [ -n "$MVNW_REPOURL" ]; then + jarUrl="$MVNW_REPOURL/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" + else + jarUrl="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" + fi + while IFS="=" read key value; do + case "$key" in (wrapperUrl) jarUrl="$value"; break ;; + esac + done < "$BASE_DIR/.mvn/wrapper/maven-wrapper.properties" + if [ "$MVNW_VERBOSE" = true ]; then + echo "Downloading from: $jarUrl" + fi + wrapperJarPath="$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" + if $cygwin; then + wrapperJarPath=`cygpath --path --windows "$wrapperJarPath"` + fi + + if command -v wget > /dev/null; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found wget ... using wget" + fi + if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then + wget "$jarUrl" -O "$wrapperJarPath" + else + wget --http-user=$MVNW_USERNAME --http-password=$MVNW_PASSWORD "$jarUrl" -O "$wrapperJarPath" + fi + elif command -v curl > /dev/null; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found curl ... using curl" + fi + if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then + curl -o "$wrapperJarPath" "$jarUrl" -f + else + curl --user $MVNW_USERNAME:$MVNW_PASSWORD -o "$wrapperJarPath" "$jarUrl" -f + fi + + else + if [ "$MVNW_VERBOSE" = true ]; then + echo "Falling back to using Java to download" + fi + javaClass="$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.java" + # For Cygwin, switch paths to Windows format before running javac + if $cygwin; then + javaClass=`cygpath --path --windows "$javaClass"` + fi + if [ -e "$javaClass" ]; then + if [ ! -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then + if [ "$MVNW_VERBOSE" = true ]; then + echo " - Compiling MavenWrapperDownloader.java ..." + fi + # Compiling the Java class + ("$JAVA_HOME/bin/javac" "$javaClass") + fi + if [ -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then + # Running the downloader + if [ "$MVNW_VERBOSE" = true ]; then + echo " - Running MavenWrapperDownloader.java ..." 
+ fi + ("$JAVA_HOME/bin/java" -cp .mvn/wrapper MavenWrapperDownloader "$MAVEN_PROJECTBASEDIR") + fi + fi + fi +fi +########################################################################################## +# End of extension +########################################################################################## + +export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"} +if [ "$MVNW_VERBOSE" = true ]; then + echo $MAVEN_PROJECTBASEDIR +fi +MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS" + +# For Cygwin, switch paths to Windows format before running java +if $cygwin; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --path --windows "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --windows "$CLASSPATH"` + [ -n "$MAVEN_PROJECTBASEDIR" ] && + MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"` +fi + +# Provide a "standardized" way to retrieve the CLI args that will +# work with both Windows and non-Windows executions. +MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $@" +export MAVEN_CMD_LINE_ARGS + +WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain + +exec "$JAVACMD" \ + $MAVEN_OPTS \ + -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \ + "-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \ + ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@" diff --git a/website/blog/modules/ROOT/attachments/fast-assembly/pom.xml b/website/blog/modules/ROOT/attachments/fast-assembly/pom.xml new file mode 100644 index 00000000000..378956c2f88 --- /dev/null +++ b/website/blog/modules/ROOT/attachments/fast-assembly/pom.xml @@ -0,0 +1,80 @@ + + 4.0.0 + + com.example + spark-app + 0.1 + jar + + + 2.12.19 + 3.5.4 + 1.8 + 1.8 + + + + + org.apache.spark + spark-core_2.12 + ${spark.version} + + + org.apache.spark + spark-sql_2.12 + ${spark.version} + + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.6.0 + + jar-with-dependencies + foo.Foo + + + + make-assembly + package + + single + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + ${maven.compiler.source} + ${maven.compiler.target} + + + + + + net.alchim31.maven + scala-maven-plugin + 4.7.1 + + + + compile + testCompile + + + + + + + \ No newline at end of file diff --git a/website/blog/modules/ROOT/attachments/fast-assembly/project/build.properties b/website/blog/modules/ROOT/attachments/fast-assembly/project/build.properties new file mode 100644 index 00000000000..c7450fc2a96 --- /dev/null +++ b/website/blog/modules/ROOT/attachments/fast-assembly/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.10.5 \ No newline at end of file diff --git a/website/blog/modules/ROOT/attachments/fast-assembly/project/plugins.sbt b/website/blog/modules/ROOT/attachments/fast-assembly/project/plugins.sbt new file mode 100644 index 00000000000..09148954f3d --- /dev/null +++ b/website/blog/modules/ROOT/attachments/fast-assembly/project/plugins.sbt @@ -0,0 +1 @@ +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.3.1") \ No newline at end of file diff --git a/website/blog/modules/ROOT/attachments/fast-assembly/src/main/resources/transactions.csv b/website/blog/modules/ROOT/attachments/fast-assembly/src/main/resources/transactions.csv new file mode 100644 index 00000000000..8f25477703b --- /dev/null +++ b/website/blog/modules/ROOT/attachments/fast-assembly/src/main/resources/transactions.csv @@ -0,0 +1,8 @@ +id,category,amount +1,Food,20.5 
+2,Electronics,250.0 +3,Food,35.0 +4,Clothing,45.5 +5,Food,15.0 +6,Electronics,125.0 +7,Clothing,75.0 \ No newline at end of file diff --git a/website/blog/modules/ROOT/attachments/fast-assembly/src/main/scala/foo/Foo.scala b/website/blog/modules/ROOT/attachments/fast-assembly/src/main/scala/foo/Foo.scala new file mode 100644 index 00000000000..b157c616582 --- /dev/null +++ b/website/blog/modules/ROOT/attachments/fast-assembly/src/main/scala/foo/Foo.scala @@ -0,0 +1,44 @@ +package foo + +import org.apache.spark.sql.{SparkSession, Dataset, DataFrame} +import org.apache.spark.sql.functions._ + +object Foo { + + case class Transaction(id: Int, category: String, amount: Double) + + def computeSummary(transactions: Dataset[Transaction]): DataFrame = { + transactions.groupBy("category") + .agg( + sum("amount").alias("total_amount"), + avg("amount").alias("average_amount"), + count("amount").alias("transaction_count") + ) + } + + def main(args: Array[String]): Unit = { + val spark = SparkSession.builder() + .appName("SparkExample") + .master("local[*]") + .getOrCreate() + + val resourcePath: String = args(0) + + import spark.implicits._ + + val df = spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv(resourcePath) + + val transactionsDS: Dataset[Transaction] = df.as[Transaction] + val summaryDF = computeSummary(transactionsDS) + + println("Summary Statistics by Category:") + summaryDF.show() + + spark.stop() + } +} + +object dummy2 diff --git a/website/blog/modules/ROOT/nav.adoc b/website/blog/modules/ROOT/nav.adoc index 1e0e9210f7d..ff03cec2fec 100644 --- a/website/blog/modules/ROOT/nav.adoc +++ b/website/blog/modules/ROOT/nav.adoc @@ -1,4 +1,5 @@ +* xref:9-mill-faster-assembly-jars.adoc[] * xref:8-what-is-a-build-tool.adoc[] * xref:7-graal-native-executables.adoc[] * xref:6-garbage-collector-perf.adoc[] diff --git a/website/blog/modules/ROOT/pages/9-mill-faster-assembly-jars.adoc b/website/blog/modules/ROOT/pages/9-mill-faster-assembly-jars.adoc new file mode 100644 index 00000000000..47ec46915b3 --- /dev/null +++ b/website/blog/modules/ROOT/pages/9-mill-faster-assembly-jars.adoc @@ -0,0 +1,583 @@ +// tag::header[] + +# Fast Incremental JVM Assembly Jar Creation with Mill + +:author: Li Haoyi +:revdate: 16 February 2025 + +_{author}, {revdate}_ + +Assembly jars are a convenient deployment format for JVM applications, bundling +your application code and resources into a single file that can run anywhere a JVM +is installed. But assembly jars can be slow to create, which can slow down iterative +development workflows that depend on them. The Mill JVM build tool uses some special +tricks to let you iterate on your assembly jars much faster than traditional build tools +like Maven or Gradle, cutting down their incremental creation time from 10s of seconds +to less than a second. This can substantially increase your developer productivity by +saving time you would otherwise spend waiting for your assembly to be created. + +// end::header[] + +## Example JVM Application + +For the purposes of this blog post, we will be using a small +https://spark.apache.org/[Apache Spark] program +as our example application. This program was written by https://github.com/monyedavid[@monyedavid] +to demonstrate how to xref:mill:ROOT:scalalib/spark.adoc[Build Spark Programs using Mill], +and does some simple processing of a CSV file to output summary statistics. 
This program
pulls in the `org.apache.spark::spark-core:3.5.4` and `org.apache.spark::spark-sql:3.5.4`
artifacts, which makes the JVM assembly jar pretty large.

NOTE: For many Spark usage patterns, e.g. https://spark.apache.org/docs/latest/submitting-applications.html[spark-submit],
you do not actually need to include `spark-core` and `spark-sql` in the assembly jar,
as the Spark cluster will provide them. Nevertheless, any JVM
developer will likely encounter scenarios where large assemblies are necessary,
whether due to third-party libraries or non-Spark frameworks. Similarly, although
the example Spark code is in Scala since that's what Spark uses, the same techniques
apply to any JVM language you may want to pack into a
self-contained assembly jar.

```scala
package foo

import org.apache.spark.sql._
import org.apache.spark.sql.functions._

object Foo {
  case class Transaction(id: Int, category: String, amount: Double)

  def computeSummary(transactions: Dataset[Transaction]): DataFrame = {
    transactions.groupBy("category")
      .agg(
        sum("amount").alias("total_amount"),
        avg("amount").alias("average_amount"),
        count("amount").alias("transaction_count")
      )
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("SparkExample")
      .master("local[*]")
      .getOrCreate()

    val resourcePath: String = args(0)

    import spark.implicits._

    val df = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv(resourcePath)

    val transactionsDS: Dataset[Transaction] = df.as[Transaction]
    val summaryDF = computeSummary(transactionsDS)

    println("Summary Statistics by Category:")
    summaryDF.show()

    spark.stop()
  }
}
```

## Example Builds

To build this small program, we will set up equivalent builds using the
https://mill-build.org/[Mill build tool], https://www.scala-sbt.org/[SBT],
and https://maven.apache.org/[Maven].

### Mill

The Mill build config for this project is shown below, setting the `scalaVersion`
and the `ivyDeps`. There is a wrinkle in needing to override `def prependShellScript`,
because Mill's xref:5-executable-jars.adoc[executable assembly jars] don't work
for large assemblies like this one.

```scala
package build
import mill._, scalalib._

object `package` extends RootModule with SbtModule {
  def scalaVersion = "2.12.19"
  def ivyDeps = Agg(
    ivy"org.apache.spark::spark-core:3.5.4",
    ivy"org.apache.spark::spark-sql:3.5.4"
  )

  def prependShellScript = ""
}
```

From this build, we can use the `./mill`
xref:mill:ROOT:cli/installation-ide.adoc#_bootstrap_scripts[bootstrap script]
to build an assembly that we can run using `java -jar`:

```bash
> ./mill show assembly
".../out/assembly.dest/out.jar"
Total time: 27s

> ls -lh out/assembly.dest/out.jar
-rw-r--r--  1 lihaoyi  staff   214M Feb 14 15:51 out/assembly.dest/out.jar

> java --add-opens java.base/sun.nio.ch=ALL-UNNAMED -jar out/assembly.dest/out.jar src/main/resources/transactions.csv
...
+-----------+------------+--------------+-----------------+
|   category|total_amount|average_amount|transaction_count|
+-----------+------------+--------------+-----------------+
|       Food|        70.5|          23.5|                3|
|Electronics|       375.0|         187.5|                2|
|   Clothing|       120.5|         60.25|                2|
+-----------+------------+--------------+-----------------+
```


### SBT

The SBT build is similar to the Mill build above.
Apart from a slightly different syntax,
SBT also needs you to specify `name` and `version` (Mill lets you skip these unless
you are publishing your module), and configure an `assemblyMergeStrategy`. SBT
also requires that you explicitly enable the `AssemblyPlugin`, whereas Mill comes with
that built in by default.

```scala
lazy val root = (project in file("."))
  .enablePlugins(AssemblyPlugin) // Enables sbt-assembly
  .settings(
    name := "spark-app",
    version := "0.1",
    scalaVersion := "2.12.19",
    libraryDependencies ++= Seq(
      "org.apache.spark" %% "spark-core" % "3.5.4",
      "org.apache.spark" %% "spark-sql" % "3.5.4",
    ),
    assembly / assemblyMergeStrategy := {
      case PathList("META-INF", "services", _*) => MergeStrategy.concat
      case PathList("META-INF", xs @ _*) => MergeStrategy.discard
      case x => MergeStrategy.first
    }
  )
```

You can then use `sbt assembly` to build a jar, and `java -jar` to execute it:

```bash
> sbt assembly
Built: .../target/scala-2.12/spark-app-assembly-0.1.jar
Total time: 18s

> ls -lh target/scala-2.12/spark-app-assembly-0.1.jar
-rw-r--r--  1 lihaoyi  staff   213M Feb 14 15:58 target/scala-2.12/spark-app-assembly-0.1.jar

> java --add-opens java.base/sun.nio.ch=ALL-UNNAMED -jar target/scala-2.12/spark-app-assembly-0.1.jar src/main/resources/transactions.csv
...
+-----------+------------+--------------+-----------------+
|   category|total_amount|average_amount|transaction_count|
+-----------+------------+--------------+-----------------+
|       Food|        70.5|          23.5|                3|
|Electronics|       375.0|         187.5|                2|
|   Clothing|       120.5|         60.25|                2|
+-----------+------------+--------------+-----------------+
```

### Maven

The Maven build is by far the most verbose of the build configurations for this
example codebase, but it contains basically the same information: `scala.version`,
`spark.version`, and dependencies on `spark-core` and `spark-sql`. Maven requires
you to enable the `maven-assembly-plugin` explicitly, similar to SBT, and on top of
that requires you to enable the `maven-compiler-plugin` and `scala-maven-plugin`:

```xml
<project>
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.example</groupId>
  <artifactId>spark-app</artifactId>
  <version>0.1</version>
  <packaging>jar</packaging>

  <properties>
    <scala.version>2.12.19</scala.version>
    <spark.version>3.5.4</spark.version>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.12</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.12</artifactId>
      <version>${spark.version}</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Assembly plugin to build the jar-with-dependencies -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>3.6.0</version>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
          <archive>
            <manifest>
              <mainClass>foo.Foo</mainClass>
            </manifest>
          </archive>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>

      <!-- Java compiler plugin -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.8.1</version>
        <configuration>
          <source>${maven.compiler.source}</source>
          <target>${maven.compiler.target}</target>
        </configuration>
      </plugin>

      <!-- Scala compiler plugin -->
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>4.7.1</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
```

Once this is all set up, you can use `./mvnw package` to build the `jar-with-dependencies`
that you can execute with `java -jar`:

```bash
> ./mvnw package
Building jar: .../target/spark-app-0.1-jar-with-dependencies.jar
Total time: 20s

> ls -lh target/spark-app-0.1-jar-with-dependencies.jar
-rw-r--r--  1 lihaoyi  staff   211M Feb 14 16:12 target/spark-app-0.1-jar-with-dependencies.jar

> java --add-opens java.base/sun.nio.ch=ALL-UNNAMED -jar target/spark-app-0.1-jar-with-dependencies.jar src/main/resources/transactions.csv
...
+-----------+------------+--------------+-----------------+
|   category|total_amount|average_amount|transaction_count|
+-----------+------------+--------------+-----------------+
|       Food|        70.5|          23.5|                3|
|Electronics|       375.0|         187.5|                2|
|   Clothing|       120.5|         60.25|                2|
+-----------+------------+--------------+-----------------+
```

We can see all 3 build tools take about 20s to build the assembly, with some
variation expected from run to run. All three jars are about the same size (~212mb),
which makes sense since they should contain the same local code and same
upstream dependencies. While 20s is a bit long, it's not that surprising
since the tool has to compress ~212mb of dependencies to assemble them into a jar file.

## Incremental Builds

While all JVM build tools take about the same amount of time for the initial build,
what is interesting is what happens for incremental builds. For example, below we
add a `class dummy` line of code to `Foo.scala` to force it to re-compile:

```bash
> echo "class dummy" >> src/main/scala/foo/Foo.scala

> ./mill show assembly
".../out/assembly.dest/out.jar"
Total time: 1s

> sbt assembly
Built: .../target/scala-2.12/spark-app-assembly-0.1.jar
Total time: 20s

> ./mvnw package
Building jar: .../target/spark-app-0.1-jar-with-dependencies.jar
Total time: 22s
```

Here, we can see that Mill only took `1s` to re-build the assembly jar,
while SBT and Maven took the same ~20s that they took the first time the
jar was built. If you play around with it, you will see that the assembly jar
does contain classfiles associated with our newly-added code:

```bash
> jar tf out/assembly.dest/out.jar | grep dummy
foo/dummy.class

> jar tf target/scala-2.12/spark-app-assembly-0.1.jar | grep dummy
foo/dummy.class

> jar tf target/spark-app-0.1-jar-with-dependencies.jar | grep dummy
foo/dummy.class
```

You can try making other code changes, e.g. to the body of the Spark program itself,
and running the output jar with `java -jar` to see that your changes are indeed
taking effect. So the question you may ask is: how is it that Mill is able to
rebuild its output assembly jar in ~1s, while other build tools are
spending a whole ~20s rebuilding it?

### Multi-Step Assemblies

The trick to Mill's fast incremental rebuilding of assembly jars is to split the
assembly jar creation into three phases.


Typically, construction of an assembly jar is a slow single-step process. The
build tool has to take all third-party dependencies, local dependencies, and
the module being assembled, compress all their files and assemble them into a `.jar`:

```graphviz
digraph G {
  rankdir=LR
  node [shape=box width=0 height=0]
  third_party_libraries -> "assembly (slow)"
  local_dependencies -> "assembly (slow)"
  current_module -> "assembly (slow)"
  third_party_libraries [shape=none]
  local_dependencies [shape=none]
  current_module [shape=none]
}
```

Mill instead does the assembly as a three-step process.
In Mill, each of
`third_party_libraries`, `local_dependencies`, and `current_module` is
added in turn to construct the final jar:

```graphviz
digraph G {
  rankdir=LR
  node [shape=box width=0 height=0]
  third_party_libraries -> "upstream_thirdparty_assembly (slow)"
  "upstream_thirdparty_assembly (slow)" -> "upstream_assembly (fast)"
  local_dependencies -> "upstream_assembly (fast)"
  "upstream_assembly (fast)" -> "assembly (fast)"
  current_module -> "assembly (fast)"
  third_party_libraries [shape=none]
  local_dependencies [shape=none]
  current_module [shape=none]
}
```

1. Third-party libraries are combined into an `upstream_thirdparty_assembly`
in the first step, which is slow but rarely needs to be re-run.
2. Local upstream modules are combined with `upstream_thirdparty_assembly`
into an `upstream_assembly` in the second step, which needs to happen
more often but is faster.
3. The current module is combined into `upstream_assembly` in the third step,
which is the fastest step but needs to happen the most frequently.


The key here is that the intermediate `upstream_thirdparty_assembly` and
`upstream_assembly` jar files can be re-used. This means that although any changes
to `third_party_libraries` will still have to go through the slow process
of creating the assemblies from scratch:

```graphviz
digraph G {
  rankdir=LR
  node [shape=box width=0 height=0]
  third_party_libraries -> "upstream_thirdparty_assembly (slow)" [color=red penwidth=2]
  "upstream_thirdparty_assembly (slow)" -> "upstream_assembly (fast)" [color=red penwidth=2]
  local_dependencies -> "upstream_assembly (fast)"
  "upstream_assembly (fast)" -> "assembly (fast)" [color=red penwidth=2]
  current_module -> "assembly (fast)"
  third_party_libraries [shape=none]
  local_dependencies [shape=none]
  current_module [shape=none]
  "upstream_thirdparty_assembly (slow)" [color=red penwidth=2]
  "upstream_assembly (fast)" [color=red penwidth=2]
  "assembly (fast)" [color=red penwidth=2]
}
```

In exchange, any changes to `local_dependencies` can skip the slowest
`upstream_thirdparty_assembly` step, and only run `upstream_assembly` and `assembly`:

```graphviz
digraph G {
  rankdir=LR
  node [shape=box width=0 height=0]
  third_party_libraries -> "upstream_thirdparty_assembly (slow)"
  "upstream_thirdparty_assembly (slow)" -> "upstream_assembly (fast)"
  local_dependencies -> "upstream_assembly (fast)" [color=red penwidth=2]
  "upstream_assembly (fast)" -> "assembly (fast)" [color=red penwidth=2]
  current_module -> "assembly (fast)"
  third_party_libraries [shape=none]
  local_dependencies [shape=none]
  current_module [shape=none]
  "upstream_assembly (fast)" [color=red penwidth=2]
  "assembly (fast)" [color=red penwidth=2]
}
```
And changes to `current_module` can skip both upstream steps, only running the fast
`assembly` step:

```graphviz
digraph G {
  rankdir=LR
  node [shape=box width=0 height=0]
  third_party_libraries -> "upstream_thirdparty_assembly (slow)"
  "upstream_thirdparty_assembly (slow)" -> "upstream_assembly (fast)"
  local_dependencies -> "upstream_assembly (fast)"
  "upstream_assembly (fast)" -> "assembly (fast)"
  current_module -> "assembly (fast)" [color=red penwidth=2]
  third_party_libraries [shape=none]
  local_dependencies [shape=none]
  current_module [shape=none]
  "assembly (fast)" [color=red penwidth=2]
}
```
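To make the caching behavior concrete, here is a toy, self-contained Scala sketch of
the idea. It is *not* Mill's actual task engine: the `LayerCache` object, its `cached`
helper, the `toy-cache` directory, and the stand-in input lists are all made up for
illustration. Each layer's output is keyed by a hash of that layer's inputs, and each
layer's inputs include the previous layer's output, so a change only re-runs the layers
downstream of it:

```scala
import java.nio.file.{Files, Path, Paths}
import java.security.MessageDigest

// Toy model (not Mill's real implementation) of layered, cached assembly steps.
object LayerCache {
  private val cacheDir = Paths.get("toy-cache") // made-up cache location

  private def sha(inputs: Seq[String]): String =
    MessageDigest.getInstance("SHA-256")
      .digest(inputs.mkString("\n").getBytes("UTF-8"))
      .map("%02x".format(_)).mkString.take(16)

  /** Run `build` only if no cached output exists for this exact set of inputs. */
  def cached(layer: String, inputs: Seq[String])(build: Path => Unit): Path = {
    Files.createDirectories(cacheDir)
    val out = cacheDir.resolve(s"$layer-${sha(inputs)}.jar")
    if (!Files.exists(out)) { println(s"rebuilding $layer"); build(out) }
    else println(s"reusing cached $layer")
    out
  }

  def main(args: Array[String]): Unit = {
    val thirdParty = Seq("spark-core-3.5.4.jar", "spark-sql-3.5.4.jar") // stand-ins
    val localDeps  = Seq("util/Helpers.class")                          // stand-ins
    val current    = Seq("foo/Foo.class", "foo/dummy.class")            // stand-ins

    // Each layer's inputs include the previous layer's output path, so an
    // upstream rebuild cascades downstream, but not the other way around.
    val l1 = cached("upstream_thirdparty_assembly", thirdParty)(out => Files.write(out, Array[Byte]()))
    val l2 = cached("upstream_assembly", l1.toString +: localDeps)(out => Files.write(out, Array[Byte]()))
    val l3 = cached("assembly", l2.toString +: current)(out => Files.write(out, Array[Byte]()))
    println(s"assembly at $l3")
  }
}
```

Running the sketch twice in a row reuses all three cached layers; adding an entry to
`current` re-runs only the `assembly` layer, while changing `thirdParty` cascades
through all three, mirroring the diagrams above.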
Building an assembly "clean" requires running all three steps and is just
as slow as the naive one-step assembly creation, as is the case where you change
third-party dependencies. But in practice these scenarios tend to happen relatively infrequently:
perhaps once a day, or even less. In contrast, the scenarios where you are changing
code in local modules happen much more frequently, often several times a minute
while you are working on your code and adding ``println``s or tweaking its behavior.
Thus, although the _worst_ case of building an assembly with Mill is no better than with other
tools, the _average_ case can be substantially better with these optimizations.

### Efficiently Updating Assembly Jars In Theory

One core assumption of the section above is that creating a new assembly jar
based on an existing one with additional files included is fast. This is not
true for every file format - e.g. `.tar.gz` files are just as expensive to append to
as they are to build from scratch, as you need to de-compress and re-compress the whole
archive - but it is true for `.jar` archives.

The key here is that `.jar` archives are just `.zip` files by another name, which
means two things:

1. Every file within the `.jar` is compressed individually, so adding additional
   files does not need existing files to be re-compressed.

2. The zip index storing the offsets and metadata of each file within the jar is
   stored at the _end_ of the `.jar` file, meaning it is straightforward to
   overwrite the index with additional files and then write a _new_ index after
   those new files without needing to move the existing files around the archive.

Visually, a zip file laid out on disk looks something like this, with each
file e.g. `Foo.class` or `MANIFEST.MF` compressed separately:


```graphviz
digraph G {
  label="archive.zip"
  node [shape=box width=0 height=0 style=filled fillcolor=white]
  zip [shape=record label="<f0> ...thirdparty dependencies... | <f1> MANIFEST.MF | <f2> central directory"]
  zip:f2:n -> zip:f1:n [label="reverse offsets"]
  zip:f2:n -> zip:f0:n
}
```

Thus, in order to add to the zip file, you can write any additional files to the
right of the last existing file (`MANIFEST.MF` above), and write an updated
`central directory` with updated pointers. Below, we see the addition of
a `Foo.class` file to the existing archive, with the `thirdparty dependencies` and `MANIFEST.MF`
files left untouched and in place.


```graphviz
digraph G {
  label="archive.zip"
  node [shape=box width=0 height=0 style=filled fillcolor=white]
  zip [shape=record label="<f0> ...thirdparty dependencies... | <f1> MANIFEST.MF | <f2> Foo.class | <f4> central directory"]
  zip:f4:n -> zip:f1:n [label="reverse offsets"]
  zip:f4:n -> zip:f0:n
  zip:f4:n -> zip:f2:n
}
```

When adding files to an existing archive, the existing files do not need to be processed at all,
making such an operation _O(added-files)_ rather than _O(total-number-of-files)_. You only
need to compress the additional files. You also need to update/rewrite the central directory
after the last added file with updated pointer offsets, but the central directory is
typically small so such an update/rewrite doesn't materially slow things down.

NOTE: Earlier versions of Mill used a two-stage assembly where `upstream_thirdparty_assembly`
and `upstream_assembly` were combined, but the latest
https://github.com/com-lihaoyi/mill/blob/main/changelog.adoc#0128---2025-02-16[0.12.8 release]
moves to the three-stage assembly described here for better performance when iterating
and generating assemblies from multi-module projects.

### Efficiently Updating Assembly Jars In Practice

In practice, the way this works on the JVM (which is how the Mill build tool does it,
since Mill is itself a JVM application) is as follows:

1. Makes a copy of the upstream assembly. Copying a file is typically fast even
   when the file is large, and allows the upstream assembly to be re-used later.

2. Opens that copy using `java.nio.file.FileSystems.newFileSystem`, which allows you
   to open an existing jar file by passing in `new URI("jar", path, null)`.

3. Modifies the returned `java.nio.file.FileSystem` using normal `java.nio.file.Files`
   operations.
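Put together, a minimal self-contained sketch of these three steps could look something
like the following. This is an illustration rather than Mill's actual implementation,
and the paths `upstream-assembly.jar`, `out.jar`, and `out/classes/foo/Foo.class` are
made-up placeholders rather than Mill's real output layout:

```scala
import java.net.URI
import java.nio.file.{FileSystems, Files, Paths, StandardCopyOption}

object AppendToAssembly {
  def main(args: Array[String]): Unit = {
    // 1. Copy the cached upstream assembly, so the original can be reused later.
    val upstream = Paths.get("upstream-assembly.jar") // placeholder path
    val out      = Paths.get("out.jar")               // placeholder path
    Files.copy(upstream, out, StandardCopyOption.REPLACE_EXISTING)

    // 2. Mount the copy as a zip filesystem via the "jar:" URI scheme.
    val zipFs = FileSystems.newFileSystem(
      URI.create("jar:" + out.toUri),
      new java.util.HashMap[String, String]()
    )
    try {
      // 3. Use ordinary java.nio.file operations to write entries *inside* the jar.
      val entry = zipFs.getPath("foo", "Foo.class")
      Files.createDirectories(entry.getParent)
      Files.copy(
        Paths.get("out", "classes", "foo", "Foo.class"), // placeholder classfile
        entry,
        StandardCopyOption.REPLACE_EXISTING
      )
    } finally zipFs.close() // closing writes out the updated central directory
  }
}
```

Because only the newly added entries need to be compressed and the existing entries are
left in place, the cost of such an update scales with the size of the local code being
added rather than with the ~200MB of Spark dependencies already inside the jar.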
Calling `FileSystems.newFileSystem` with a `"jar"` URL returns a
https://github.com/openjdk/jdk/blob/master/src/jdk.zipfs/share/classes/jdk/nio/zipfs/ZipFileSystem.java[ZipFileSystem].
`ZipFileSystem` basically implements all the normal `java.nio.file.Files.*` operations that
normally modify files on disk, and replaces them with versions that instead modify
the entries inside a `.zip` file. And since `.zip` files have every file individually
compressed (unlike e.g. `.tar.gz`, which compresses them together), `ZipFileSystem` is
able to efficiently read and write individual entries in the `.zip` file without needing
to un-pack and re-pack the entire archive.

While we discussed how adding files
to a jar can be done efficiently, there is also subtlety around other operations such
as modifying files, removing files, etc., which are less trivial. But the JDK's built-in
`ZipFileSystem` implements all these in a reasonable manner, and what is important is that
it allows Mill to incrementally update its assembly jars in (more or less)
_O(size-of-local-code)_, which is typically much smaller than the
_O(size-of-transitive-dependencies)_ which a naive assembly-jar creation process requires.

## Conclusion

This blog post has discussed how Mill is able to provide fast incremental updates to
generated assembly jars: in the example shown above, it sped up Spark assembly jar creation
from ~20s to ~1s vs. the equivalent workflow in other build tools like Maven or SBT.
This speedup can apply to any JVM codebase, although the benefit would depend on the
size of your local application code and its transitive dependencies. There is some overhead
to "clean build" assembly jars from scratch, but such scenarios typically happen much
less frequently than the "incremental update" scenario, and so the tradeoff can be worth it.

Mill splits its assembly jars into three hardcoded "layers", but more sophisticated
update schemes are also possible. One could imagine a build tool that keeps track of
what files were put into the assembly jar previously, diffed that against the current
set of files, and did the copy-and-update only for the files within the jar that
have changed outside of it. That would allow much more fine-grained incremental
updates to be done to the assembly jar, which may matter in large codebases where
Mill's hardcoded three-layer split isn't sufficient to keep things fast.

It turns out there's no magic in Mill's fast assembly generation: just careful use of
the available APIs provided by the underlying JVM platform. Hopefully this approach
can eventually make its way to other build tools like Maven or SBT, so everyone can
benefit from the fast assembly jar creation that Mill provides today.
diff --git a/website/blog/modules/ROOT/pages/index.adoc b/website/blog/modules/ROOT/pages/index.adoc index e9ec2bec962..762a377bc19 100644 --- a/website/blog/modules/ROOT/pages/index.adoc +++ b/website/blog/modules/ROOT/pages/index.adoc @@ -8,6 +8,10 @@ technical topics related to JVM platform tooling and language-agnostic build too some specific to the Mill build tool but mostly applicable to anyone working on build tooling for large codebases in JVM and non-JVM languages. +include::9-mill-faster-assembly-jars.adoc[tag=header,leveloffset=1] + +xref:9-mill-faster-assembly-jars.adoc[Read More...] + include::8-what-is-a-build-tool.adoc[tag=header,leveloffset=1] xref:8-what-is-a-build-tool.adoc[Read More...]