diff --git a/.gitignore b/.gitignore
index cf3861c..ca418ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@ target
 tf-sandbox
 spark-warehouse/
 metastore_db/
+project/project/
+test-output.tfr
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..b261cc2
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,21 @@
+language: scala
+
+# Cache settings here are based on the latest SBT documentation.
+cache:
+  directories:
+    - $HOME/.ivy2/cache
+    - $HOME/.sbt/boot/
+
+before_cache:
+  # Tricks to avoid unnecessary cache updates
+  - find $HOME/.ivy2 -name "ivydata-*.properties" -delete
+  - find $HOME/.sbt -name "*.lock" -delete
+
+scala:
+  - 2.11.8
+
+jdk:
+  - oraclejdk8
+
+script:
+  - sbt ++$TRAVIS_SCALA_VERSION clean publish-local
diff --git a/README.md b/README.md
index 2d0fdac..24fa21a 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+[![Build Status](https://travis-ci.org/tapanalyticstoolkit/spark-tensorflow-connector.svg?branch=sbt)](https://travis-ci.org/tapanalyticstoolkit/spark-tensorflow-connector)
+
 # spark-tensorflow-connector
 
 This repo contains a library for loading and storing TensorFlow records with [Apache Spark](http://spark.apache.org/).
@@ -19,19 +21,34 @@ None.
 
 2. [Apache Maven](https://maven.apache.org/)
 
 ## Building the library
-Build the library using Maven as shown below.
+You can build the library with either Maven or SBT.
+
+#### Maven
+Build the library using Maven (3.3) as shown below:
 ```sh
 mvn clean install
 ```
 
+#### SBT
+Build the library using SBT (0.13.13) as shown below:
+```sh
+sbt clean assembly
+```
+
 ## Using Spark Shell
 Run this library in Spark using the `--jars` command line option in `spark-shell` or `spark-submit`. For example:
 
+Maven jars:
 ```sh
 $SPARK_HOME/bin/spark-shell --jars target/spark-tensorflow-connector-1.0-SNAPSHOT.jar,target/lib/tensorflow-hadoop-1.0-01232017-SNAPSHOT-shaded-protobuf.jar
 ```
 
+SBT jar:
+```sh
+$SPARK_HOME/bin/spark-shell --jars target/scala-2.11/spark-tensorflow-connector-assembly-1.0-SNAPSHOT.jar
+```
+
 The following code snippet demonstrates usage.
 
 ```scala
@@ -40,7 +57,7 @@ import org.apache.spark.sql.{ DataFrame, Row }
 import org.apache.spark.sql.catalyst.expressions.GenericRow
 import org.apache.spark.sql.types._
 
-val path = s"$TF_SANDBOX_DIR/test-output.tfr"
+val path = "test-output.tfr"
 val testRows: Array[Row] = Array(
 new GenericRow(Array[Any](11, 1, 23L, 10.0F, 14.0, List(1.0, 2.0), "r1")),
 new GenericRow(Array[Any](21, 2, 24L, 12.0F, 15.0, List(2.0, 2.0), "r2")))
diff --git a/build.sbt b/build.sbt
new file mode 100644
index 0000000..c4ead37
--- /dev/null
+++ b/build.sbt
@@ -0,0 +1,52 @@
+scalaVersion in Global := "2.11.8"
+
+def ProjectName(name: String, path: String): Project = Project(name, file(path))
+
+resolvers in Global ++= Seq("https://tap.jfrog.io/tap/public" at "https://tap.jfrog.io/tap/public",
+  "https://tap.jfrog.io/tap/public-snapshots" at "https://tap.jfrog.io/tap/public-snapshots",
+  "https://repo.maven.apache.org/maven2" at "https://repo.maven.apache.org/maven2")
+
+val `junit_junit` = "junit" % "junit" % "4.12"
+
+val `org.apache.hadoop_hadoop-yarn-api` = "org.apache.hadoop" % "hadoop-yarn-api" % "2.7.3"
+
+val `org.apache.spark_spark-core_2.11` = "org.apache.spark" % "spark-core_2.11" % "2.1.0"
+
+val `org.apache.spark_spark-sql_2.11` = "org.apache.spark" % "spark-sql_2.11" % "2.1.0"
+
+val `org.apache.spark_spark-mllib_2.11` = "org.apache.spark" % "spark-mllib_2.11" % "2.1.0"
+
+val `org.scalatest_scalatest_2.11` = "org.scalatest" % "scalatest_2.11" % "2.2.6"
+
+val `org.tensorflow_tensorflow-hadoop` = "org.tensorflow" % "tensorflow-hadoop" % "1.0-01232017-SNAPSHOT"
+
+
+spName := "spark-tensorflow-connector"
+
+sparkVersion := "2.1.0"
+
+sparkComponents ++= Seq("sql", "mllib")
+
+spIgnoreProvided := true
+
+version := "1.0-SNAPSHOT"
+
+name := "spark-tensorflow-connector"
+
+organization := "org.trustedanalytics"
+
+libraryDependencies in Global ++= Seq(`org.tensorflow_tensorflow-hadoop` classifier "shaded-protobuf",
+  `org.scalatest_scalatest_2.11` % "test",
+  `org.apache.spark_spark-sql_2.11` % "provided",
+  `org.apache.spark_spark-mllib_2.11` % "test" classifier "tests",
+  `org.apache.spark_spark-core_2.11` % "provided",
+  `org.apache.hadoop_hadoop-yarn-api` % "provided",
+  `junit_junit` % "test")
+
+assemblyExcludedJars in assembly := {
+  val cp = (fullClasspath in assembly).value
+  cp filterNot { x => List("spark-tensorflow-connector-1.0-SNAPSHOT.jar",
+    "tensorflow-hadoop-1.0-01232017-SNAPSHOT-shaded-protobuf.jar").contains(x.data.getName) }
+}
+
+licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html"))
diff --git a/project/build.properties b/project/build.properties
new file mode 100644
index 0000000..5f32afe
--- /dev/null
+++ b/project/build.properties
@@ -0,0 +1 @@
+sbt.version=0.13.13
\ No newline at end of file
diff --git a/project/plugins.sbt b/project/plugins.sbt
new file mode 100644
index 0000000..0536c57
--- /dev/null
+++ b/project/plugins.sbt
@@ -0,0 +1,5 @@
+resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/maven/"
+
+addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3")
+
+addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.5")
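
The README hunk above only shows the changed portion of the usage snippet. For reference, here is a minimal sketch of the round trip that snippet performs; the `tfrecords` format short name, the schema field names, and the ambient `spark` session are illustrative assumptions, not part of this patch.

```scala
// Sketch only: assumes a Spark 2.1 spark-shell session (`spark` in scope)
// with the connector jar on the classpath. The "tfrecords" short name is
// an assumption; substitute the connector's DataSource name if it differs.
import org.apache.spark.sql.{ DataFrame, Row }
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types._

val path = "test-output.tfr"
val testRows: Array[Row] = Array(
  new GenericRow(Array[Any](11, 1, 23L, 10.0F, 14.0, List(1.0, 2.0), "r1")),
  new GenericRow(Array[Any](21, 2, 24L, 12.0F, 15.0, List(2.0, 2.0), "r2")))

// A schema matching the seven fields of each row (field names are illustrative).
val schema = StructType(List(
  StructField("id", IntegerType),
  StructField("IntLabel", IntegerType),
  StructField("LongLabel", LongType),
  StructField("FloatLabel", FloatType),
  StructField("DoubleLabel", DoubleType),
  StructField("VectorLabel", ArrayType(DoubleType, containsNull = true)),
  StructField("name", StringType)))

// Write the rows out as TFRecords, then read them back into a DataFrame.
val df: DataFrame = spark.createDataFrame(spark.sparkContext.parallelize(testRows), schema)
df.write.format("tfrecords").save(path)
val importedDf: DataFrame = spark.read.format("tfrecords").load(path)
importedDf.show()
```

If the short name is not registered, the fully qualified name of the connector's data source can be passed to `format(...)` instead.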