Add support on Azure Synapse for SAR+ #1636

Merged
merged 25 commits into from Feb 14, 2022
31 changes: 22 additions & 9 deletions .github/workflows/sarplus.yml
@@ -19,6 +19,7 @@ on:
- .github/workflows/sarplus.yml

env:
SARPLUS_ROOT: ${{ github.workspace }}/contrib/sarplus
PYTHON_ROOT: ${{ github.workspace }}/contrib/sarplus/python
SCALA_ROOT: ${{ github.workspace }}/contrib/sarplus/scala

@@ -52,15 +53,20 @@ jobs:

- name: Package and check
run: |
# build
cd "${PYTHON_ROOT}"
cp ../VERSION ./pysarplus/
cp "${SARPLUS_ROOT}/VERSION" ./pysarplus/VERSION
python -m build --sdist
python -m twine check dist/*

# set sarplus_version
SARPLUS_VERSION=$(cat "${SARPLUS_ROOT}/VERSION")
echo "sarplus_version=${SARPLUS_VERSION}" >> $GITHUB_ENV

- name: Test
run: |
cd "${PYTHON_ROOT}"
python -m pip install dist/*.gz
python -m pip install dist/*.tar.gz

cd "${SCALA_ROOT}"
export SPARK_VERSION=$(python -m pip show pyspark | grep -i version | cut -d ' ' -f 2)
@@ -75,14 +81,13 @@

cd "${PYTHON_ROOT}"
pytest ./tests
echo "sarplus_version=$(cat ../VERSION)" >> $GITHUB_ENV

- name: Upload Python package as GitHub artifact
if: github.ref == 'refs/heads/main' && matrix.python-version == '3.10'
uses: actions/upload-artifact@v2
with:
name: pysarplus-${{ env.sarplus_version }}
path: ${{ env.PYTHON_ROOT }}/dist/*.gz
path: ${{ env.PYTHON_ROOT }}/dist/*.tar.gz

scala-test:
# Test sarplus with different versions of Databricks runtime, 2 LTSs and 1
@@ -129,6 +134,8 @@ jobs:
env:
GPG_KEY: ${{ secrets.SARPLUS_GPG_PRI_KEY_ASC }}
run: |
SARPLUS_VERSION=$(cat "${SARPLUS_ROOT}/VERSION")

# generate artifacts
cd "${SCALA_ROOT}"
export SPARK_VERSION="3.1.2"
@@ -142,18 +149,24 @@
export HADOOP_VERSION="3.3.1"
export SCALA_VERSION="2.12.14"
sbt ++${SCALA_VERSION}! package
sbt ++${SCALA_VERSION}! packageDoc
sbt ++${SCALA_VERSION}! packageSrc
sbt ++${SCALA_VERSION}! makePom

# sign with GPG
cd target/scala-2.12
cd "${SCALA_ROOT}/target/scala-2.12"
gpg --import <(cat <<< "${GPG_KEY}")
for file in {*.jar,*.pom}; do gpg -ab "${file}"; done

# bundle
jar cvf sarplus-bundle_2.12-$(cat ../VERSION).jar *.jar *.pom *.asc
echo "sarplus_version=$(cat ../VERSION)" >> $GITHUB_ENV
jar cvf sarplus-bundle_2.12-${SARPLUS_VERSION}.jar sarplus_*.jar sarplus_*.pom sarplus_*.asc
jar cvf sarplus-spark-3.2-plus-bundle_2.12-${SARPLUS_VERSION}.jar sarplus-spark*.jar sarplus-spark*.pom sarplus-spark*.asc

# set sarplus_version
echo "sarplus_version=${SARPLUS_VERSION}" >> $GITHUB_ENV

- name: Upload Scala bundle as GitHub artifact
uses: actions/upload-artifact@v2
with:
name: sarplus-bundle_2.12-${{ env.sarplus_version }}
path: ${{ env.SCALA_ROOT }}/target/scala-2.12/sarplus-bundle_2.12-${{ env.sarplus_version }}.jar
path: ${{ env.SCALA_ROOT }}/target/scala-2.12/*bundle*.jar
131 changes: 71 additions & 60 deletions contrib/sarplus/DEVELOPMENT.md
@@ -1,72 +1,77 @@
# Packaging

For [databricks](https://databricks.com/) to properly install a [C++
extension](https://docs.python.org/3/extending/building.html), one
must take a detour through [pypi](https://pypi.org/). Use
[twine](https://github.com/pypa/twine) to upload the package to
[pypi](https://pypi.org/).

```bash
# build dependencies
python -m pip install -U build pip twine

cd python
cp ../VERSION ./pysarplus/ # version file
python -m build --sdist
python -m twine upload dist/*
```

On [Spark](https://spark.apache.org/) one can install all 3 components
(C++, Python, Scala) in one pass by creating a [Spark
Package](https://spark-packages.org/). Steps to install

1. Package and publish the [pip package](python/setup.py) (see above)
2. Package the [Spark package](scala/build.sbt), which includes the
Steps to package and publish (also described in
[sarplus.yml](../../.github/workflows/sarplus.yml)):
1. Package and publish the [pip package](python/setup.py). For
[databricks](https://databricks.com/) to properly install a [C++
extension](https://docs.python.org/3/extending/building.html), one
must take a detour through [pypi](https://pypi.org/). Use
[twine](https://github.com/pypa/twine) to upload the package to
[pypi](https://pypi.org/).

```bash
# build dependencies
python -m pip install -U build pip twine

cd python
cp ../VERSION ./pysarplus/ # copy version file
python -m build --sdist
python -m twine upload dist/*
```

2. Package the [Scala package](scala/build.sbt), which includes the
[Scala formatter](scala/src/main/scala/microsoft/sarplus) and
references the pip package (see below)
references the pip package.

```bash
export SARPLUS_VERSION=$(cat VERSION)
export SPARK_VERSION="3.1.2"
export HADOOP_VERSION="2.7.4"
export SCALA_VERSION="2.12.10"
GPG_KEY="<gpg-private-key>"

# generate artifacts
cd scala
sbt ++${SCALA_VERSION}! package
sbt ++${SCALA_VERSION}! packageDoc
sbt ++${SCALA_VERSION}! packageSrc
sbt ++${SCALA_VERSION}! makePom

# generate the artifact (sarplus-*-spark32.jar) for Spark 3.2+
export SPARK_VERSION="3.2.0"
export HADOOP_VERSION="3.3.1"
export SCALA_VERSION="2.12.14"
sbt ++${SCALA_VERSION}! package
sbt ++${SCALA_VERSION}! packageDoc
sbt ++${SCALA_VERSION}! packageSrc
sbt ++${SCALA_VERSION}! makePom

# sign with GPG
cd target/scala-${SCALA_VERSION%.*}
gpg --import <(cat <<< "${GPG_KEY}")
for file in {*.jar,*.pom}; do gpg -ab "${file}"; done

# bundle
jar cvf sarplus-bundle_2.12-${SARPLUS_VERSION}.jar sarplus_*.jar sarplus_*.pom sarplus_*.asc
jar cvf sarplus-spark-3.2-plus-bundle_2.12-${SARPLUS_VERSION}.jar sarplus-spark*.jar sarplus-spark*.pom sarplus-spark*.asc
```

where `SPARK_VERSION`, `HADOOP_VERSION`, `SCALA_VERSION` should be
customized as needed.

3. Upload the zipped Scala package bundle to [Nexus Repository
Manager](https://oss.sonatype.org/) through a browser (See [publish
manual](https://central.sonatype.org/publish/publish-manual/)).

```bash
export SPARK_VERSION="3.1.2"
export HADOOP_VERSION="2.7.4"
export SCALA_VERSION="2.12.10"
GPG_KEY="<gpg-private-key>"

# generate artifacts
cd scala
sbt ++${SCALA_VERSION}! package
sbt ++${SCALA_VERSION}! packageDoc
sbt ++${SCALA_VERSION}! packageSrc
sbt ++${SCALA_VERSION}! makePom

# generate the artifact (sarplus-*-spark32.jar) for Spark 3.2+
export SPARK_VERSION="3.2.0"
export HADOOP_VERSION="3.3.1"
export SCALA_VERSION="2.12.14"
sbt ++${SCALA_VERSION}! package

# sign with GPG
cd target/scala-${SCALA_VERSION%.*}
gpg --import <(cat <<< "${GPG_KEY}")
for file in {*.jar,*.pom}; do gpg -ab "${file}"; done

# bundle
jar cvf sarplus-bundle_2.12-$(cat ../VERSION).jar *.jar *.pom *.asc
```

where `SPARK_VERSION`, `HADOOP_VERSION`, `SCALA_VERSION` should be
customized as needed.


## Testing

To test the python UDF + C++ backend

```bash
# build dependencies
# dependencies
python -m pip install -U build pip twine
python -m pip install -U flake8 pytest pytest-cov scikit-learn

# build
cd python
@@ -97,11 +102,17 @@ Extended Support, 7.3 LTS, 9.1 LTS, 10.0 and 10.1) on Azure Databricks
Service. However, there is a breaking change of
[org/apache.spark.sql.execution.datasources.OutputWriter](https://github.com/apache/spark/blob/dc0fa1eef74238d745dabfdc86705b59d95b07e1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala#L74)
on **Spark 3.2**, which adds an extra function `path()`, so an
additional JAR file with the classifier `spark32` will be needed if
running on Spark 3.2 (See above for packaging).

Also, extra configurations are also required when running on Spark
3.x:
additional package called [Sarplus Spark 3.2
Plus](https://search.maven.org/artifact/com.microsoft.sarplus/sarplus-spark-3-2-plus_2.12)
(with Maven coordinate such as
`com.microsoft.sarplus:sarplus-spark-3-2-plus_2.12:0.5.4`) should be
used if running on Spark 3.2 instead of
[Sarplus](https://search.maven.org/artifact/com.microsoft.sarplus/sarplus_2.12)
(with Maven coordinate like
`com.microsoft.sarplus:sarplus_2.12:0.5.4`).
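
As an illustrative sketch only (the coordinates and the `0.5.4` version come from the paragraph above; the `spark-shell` invocation itself is an assumption, not part of this change), the appropriate artifact can be attached at launch time with Spark's `--packages` flag:

```bash
# On Spark 3.2, attach the Sarplus Spark 3.2 Plus coordinate
spark-shell --packages com.microsoft.sarplus:sarplus-spark-3-2-plus_2.12:0.5.4

# On earlier Spark 3.x releases, attach the plain Sarplus coordinate
spark-shell --packages com.microsoft.sarplus:sarplus_2.12:0.5.4
```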

In addition to `spark.sql.crossJoin.enabled true`, extra
configurations are required when running on Spark 3.x:

```
spark.sql.sources.default parquet
```