Prepare for Release 0.3.7 (#180)
* [sbt] version updates

* [sbt] disable build for scala 2.12

* [conf] allow not_analyzed string fields (#145)

* [not-analyzed-fields] do not analyze fields ending with _notanalyzed

* [sbt] version updates

* [sbt] disable build for scala 2.12

* [conf] allow not_analyzed string fields (#145)

* [not-analyzed-fields] do not analyze fields ending with _notanalyzed

* Revert "Revert "Setting version to 0.3.5-SNAPSHOT""

This reverts commit a6da0af.

* [build] update Lucene to 7.7.0

* Hotfix: issue 150 (#151)

* Remove unused code (#141)

* Revert "Setting version to 0.3.4-SNAPSHOT"

This reverts commit 2f1d7be.

* README: update to 0.3.3

* README: fix javadoc badge

* remove unused param

* [sbt] version updates

* [conf] allow not_analyzed string fields (#145)

* [not-analyzed-fields] do not analyze fields ending with _notanalyzed

* [hotfix] fixes issue 150

* [tests] issue 150

* fix typo

* [blockEntityLinkage] drop queryPartColumns

* [sbt] version updates

* [scripts] fix shell

* Block linkage: allow a block linker with Row to Query (#154)

* [linkage] block linker with => Query

* [linkage] block linker is Row => Query

* remove Query analyzer on methods

* [sbt] set version to 0.3.6-SNAPSHOT

* Feature: allow custom analyzers during compile time (#160); a sketch of such an analyzer follows this commit list

* [analyzers] custom analyzer

* test return null

* [travis] travis_wait 1 min

* Revert "[travis] travis_wait 1 min"

This reverts commit c79456e.

* use lucene examples

* custom analyzer return null

* fix java reflection

* add docs

* Update to Lucene 8 (#161)

* [lucene] upgrade to version 8.0.0

* [lucene] remove ngram analyzer

* delete ngram analyzer

* minor fix

* add scaladoc

* LuceneRDDResponseSpec.collect() should work when no results are found - Issue #166 (#168)

* [sbt] update scalatest 3.0.7

* [sbt] update spark 2.4.1

* [build.sbt] add credentials file

* [plugins] update versions

* [sbt] update to 0.13.18

* Allow Lucene Analyzers per field (#164)

* [issue_163] per field analysis

* [sbt] update scalatest to 3.0.7

* [issue_163] fix docs; order of arguments

* fixes on ShapeLuceneRDD

* [issue_163] fix test

* issue_163: minor fix

* introduce LuceneRDDParams case class

* fix apply in LuceneRDDParams

* [issue_163] remove duplicate apply defn

* add extra LuceneRDD.apply

* [issue_165] throw runtime exception; use traversable trait (#170)

[issue_165] throw runtime exception; handle multi-valued fields in DataFrames

* [config] refactor; add environment variables in config (#173)

* [refactor] configuration loading

* [travis] code hygiene

* Make LuceneRDDResponse extend RDD[Row] (#175); a usage sketch follows this commit list

* WIP

* fix tests

* remove SparkDoc class

* make test compile

* use GenericRowWithSchema

* tests: getDouble score

* score is a float

* fix casting issue with Seq[String]

* tests: LuceneDocToSparkRowpec

* tests: LuceneDocToSparkRowpec

* more tests

* LuceneDocToSparkRowpec: more tests

* LuceneDocToSparkRowpec: fix tests

* LuceneDocToSparkRow: fix Number type inference

* LuceneDocToSparkRowpec: fix tests

* implicits: remove StoredField for Numeric types

* implicits: revert remove StoredField for Numeric types

* fix more tests

* fix more tests

* [tests] fix LuceneRDDResponse .toDF()

* fix multivalued fields

* fix score type issue

* minor

* stored fields for numerics

* hotfix: TextField must be stored using StoredField

* hotfix: stringToDocument implicit

* link issue 179

* fix tests

* remove _.toRow() calls

* fix compile issue

* [sbt] update to spark 2.4.2

* [travis] use spark 2.4.2
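
Two of the changes above lend themselves to short illustrations. First, the compile-time custom analyzers (#160): the sketch below is a plain Lucene 8 analyzer of the kind the feature is meant to pick up. It is an assumption, based on the "fix java reflection" commit, that the library instantiates such a class reflectively from a class name supplied in configuration; the class name here is made up.

```scala
import org.apache.lucene.analysis.{Analyzer, LowerCaseFilter}
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents
import org.apache.lucene.analysis.core.WhitespaceTokenizer

// Hypothetical custom analyzer: whitespace tokenization followed by lowercasing.
// The no-argument constructor matters if, as assumed, the library creates the
// analyzer via reflection from a configured class name.
class LowercaseWhitespaceAnalyzer extends Analyzer {
  override protected def createComponents(fieldName: String): TokenStreamComponents = {
    val tokenizer = new WhitespaceTokenizer()
    new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer))
  }
}
```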
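
Second, LuceneRDDResponse extending RDD[Row] (#175): results can now be consumed with ordinary Spark Row accessors. A minimal sketch, assuming the package-level implicit conversions and the existing termQuery(field, query, topK) signature; the sample DataFrame and the printed field are made up, and per the commits above the score is carried as a Float (its field name is not shown here).

```scala
import org.apache.spark.sql.SparkSession
import org.zouzias.spark.lucenerdd._ // LuceneRDD plus implicit document conversions

val spark = SparkSession.builder().appName("lucenerdd-rows-sketch").getOrCreate()
import spark.implicits._

// Index a toy DataFrame (column names are made up for this sketch).
val cities = Seq(("Helsinki", "Finland"), ("Athens", "Greece")).toDF("city", "country")
val luceneRDD = LuceneRDD(cities)

// The response is now an RDD[Row] carrying the indexed fields plus a score,
// so plain Row accessors and RDD operations apply.
val response = luceneRDD.termQuery("country", "greece", 10)
response.collect().foreach(row => println(row.getAs[String]("city")))
```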
zouzias authored Apr 26, 2019
1 parent bfa3743 commit ef52f2c
Showing 39 changed files with 982 additions and 582 deletions.
18 changes: 7 additions & 11 deletions .travis.yml
@@ -1,4 +1,6 @@
language: scala
scala:
- 2.11.12
sudo: false
dist: trusty
cache:
@@ -15,22 +17,16 @@ before_cache:
matrix:
include:
- jdk: oraclejdk8
scala: 2.11.12
env: TEST_SPARK_VERSION="2.4.0" LUCENE_ANALYZER="en" LINKER_METHOD="cartesian"
env: TEST_SPARK_VERSION="2.4.2" LUCENERDD_ANALYZER_NAME="en" LUCENERDD_LINKER_METHOD="cartesian"
- jdk: openjdk8
scala: 2.11.12
env: TEST_SPARK_VERSION="2.4.0" LUCENE_ANALYZER="en" LINKER_METHOD="collectbroadcast"
env: TEST_SPARK_VERSION="2.4.2" LUCENERDD_ANALYZER_NAME="en" LUCENERDD_LINKER_METHOD="collectbroadcast"
- jdk: openjdk8
scala: 2.11.12
env: TEST_SPARK_VERSION="2.4.0" LUCENE_ANALYZER="whitespace" LINKER_METHOD="cartesian"
env: TEST_SPARK_VERSION="2.4.2" LUCENERDD_ANALYZER_NAME="whitespace" LUCENERDD_LINKER_METHOD="cartesian"
- jdk: oraclejdk8
scala: 2.11.12
env: TEST_SPARK_VERSION="2.4.0" LUCENE_ANALYZER="whitespace" LINKER_METHOD="collectbroadcast"
env: TEST_SPARK_VERSION="2.4.2" LUCENERDD_ANALYZER_NAME="whitespace" LUCENERDD_LINKER_METHOD="collectbroadcast"
script:
- sbt ++$TRAVIS_SCALA_VERSION clean update
-Dlucenerdd.linker.method=${LINKER_METHOD}
-Dlucenerdd.spatial.linker.method=${LINKER_METHOD}
-Dlucenerdd.analyzer.name=${LUCENE_ANALYZER} -test
-Dlucenerdd.spatial.linker.method=${LUCENE_SPATIAL_LINKER_METHOD} -test
- sbt ++$TRAVIS_SCALA_VERSION scalastyle
- sbt ++$TRAVIS_SCALA_VERSION assembly
- travis_wait 30 sbt ++$TRAVIS_SCALA_VERSION clean coverage test coverageReport
10 changes: 6 additions & 4 deletions build.sbt
@@ -77,10 +77,12 @@ pomExtra := <scm>
</developer>
</developers>

val luceneV = "7.7.1"
credentials += Credentials(Path.userHome / ".sbt" / ".credentials")

val luceneV = "8.0.0"

spName := "zouzias/spark-lucenerdd"
sparkVersion := "2.4.0"
sparkVersion := "2.4.2"
spShortDescription := "Spark RDD with Lucene's query capabilities"
sparkComponents ++= Seq("core", "sql", "mllib")
spAppendScalaVersion := true
@@ -94,8 +96,8 @@ testSparkVersion := sys.props.get("spark.testVersion").getOrElse(sparkVersion.va


// scalastyle:off
val scalactic = "org.scalactic" %% "scalactic" % "3.0.6"
val scalatest = "org.scalatest" %% "scalatest" % "3.0.6" % "test"
val scalactic = "org.scalactic" %% "scalactic" % "3.0.7"
val scalatest = "org.scalatest" %% "scalatest" % "3.0.7" % "test"

val joda_time = "joda-time" % "joda-time" % "2.10.1"
val algebird = "com.twitter" %% "algebird-core" % "0.13.5"
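
For reference, a downstream build picking up this release would declare the dependency roughly as below; the Maven coordinates are assumed from the project's published artifacts rather than taken from this diff.

```scala
// Hypothetical consumer build.sbt (coordinates assumed, not part of this change)
libraryDependencies += "org.zouzias" %% "spark-lucenerdd" % "0.3.7"

// Spark itself is expected to be provided by the runtime environment
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.2" % "provided"
```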
2 changes: 1 addition & 1 deletion project/build.properties
@@ -1 +1 @@
sbt.version=0.13.17
sbt.version=0.13.18
18 changes: 0 additions & 18 deletions project/buildinfo.sbt

This file was deleted.

8 changes: 5 additions & 3 deletions project/plugins.sbt
@@ -17,11 +17,13 @@

resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/maven/"

addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.3.0")
addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.7.0")

addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.7")
addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.4.0")

addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.9")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9")

addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.11")

addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")

61 changes: 43 additions & 18 deletions src/main/resources/reference.conf
@@ -2,15 +2,12 @@ lucenerdd {

// Name of analyzer as it is under Lucene's package org.apache.lucene.analysis.XX
analyzer.name = "en"
analyzer.name=${?LUCENERDD_ANALYZER_NAME}

// Analyzer name must be "ngram"
analyzer {
ngram.mingram = 2
ngram.maxgram = 5
}


// Similarity scoring for Lucenes
similarity.name = "bm25" // anything else will default to Lucene classic similarity
similarity.name = ${?LUCENERDD_SIMILARITY_NAME}

// Supported linkage methods
// "collectbroadcast" : Collects the RDD that contains the queries (to be used only if query RDD
@@ -20,6 +17,7 @@ lucenerdd {
// of LuceneRDD. Note it duplicates each partition of LuceneRDD n times where n is the number of
// partitions of the queries RDD.
linker.method = "collectbroadcast"
linker.method = ${?LUCENERDD_LINKER_METHOD}

index {

@@ -28,17 +26,20 @@
// Otherwise the index will be stored in memory
// Do not use memory, see http://lucene.apache.org/core/7_5_0/core/org/apache/lucene/store/RAMDirectory.html
store.mode = "disk"
store.mode = ${?LUCENERDD_INDEX_STORE_MODE}

stringfields{

// Analyze string fields by default or not
// Implicit fields, like _1, _2, etc will use this option
analyzed = true
analyzed = ${?LUCENERDD_INDEX_STRINGFIELDS_ANALYZED}

// Select a subset of string fields that you do not wish to be analyzed
// Due to serialization issues this list should be set before starting a Spark Session
// Moreover, all text/string fields that end with '_notanalyzed' are not analyzed
not_analyzed_list = []
not_analyzed_list = ${?LUCENERDD_INDEX_STRINGFIELDS_NOT_ANALYZED_LIST}

// Text fields options as in org.apache.lucene.index.IndexOptions
//
@@ -49,36 +50,59 @@
// "DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS"
// "NONE"
options = "docs_and_freqs_and_positions_and_offsets"
options = ${?LUCENERDD_INDEX_STRINGFIELDS_OPTIONS}

// Omit terms norms
terms.omitnorms = false
terms {
// Omit terms norms
omitnorms = false
omitnorms = ${?LUCENERDD_INDEX_STRINGFIELDS_TERMS_OMITNORMS}

// Store term positions
terms.positions = false
// Store term positions
positions = false
positions = ${?LUCENERDD_INDEX_STRINGFIELDS_TERMS_POSITIONS}

// Store Term vectors (set true, otherwise LuceneRDD.termVectors(fieldName) will fail)
terms.vectors = true
// Store Term vectors (set true, otherwise LuceneRDD.termVectors(fieldName) will fail)
vectors = true
vectors = ${?LUCENERDD_INDEX_STRINGFIELDS_TERMS_VECTORS}
}
}
}

// Maximum value on topK queries
query.topk.maxvalue = 100
// Default value of number of returned results
query.topk.default = 10
query {
// Maximum value on topK queries
topk.maxvalue = 100
topk.maxvalue = ${?LUCENERDD_QUERY_TOPK_MAXVALUE}

// Default value of number of returned results
topk.default = 10
topk.default = ${?LUCENERDD_QUERY_TOPK_DEFAULT}

// Default value of number of faceted results
query.facets.number.default = 10
// Default value of number of faceted results
facets.number.default = 10
facets.number.default = ${?LUCENERDD_QUERY_FACETS_NUMBER_DEFAULT}

}

// Spatial related configurations used by ShapeLuceneRDD
spatial {
prefixtree {

// Spatial tree data structure
name = "quad" // "geohash" or "quad"
name = ${?LUCENE_SPATIAL_PREFIXTREE_NAME}

maxlevel = 9 // 11 results in sub-meter precision for geohash
maxlevel = ${?LUCENE_SPATIAL_PREFIXTREE_MAXLEVEL}


maxDistErr = 5.0 // in kilometers
maxDistErr = ${?LUCENE_SPATIAL_PREFIXTREE_MAXDISTERR}

}

// Shape format can be one of ShapeIO.GeoJSON, ShapeIO.LEGACY, ShapeIO.POLY, ShapeIO.WKT
shape.io.format = "WKT"
shape.io.format = ${?LUCENE_SPATIAL_SHAPE_IO_FORMAT}

// Supported linkage methods
// "collectbroadcast" : Collects the RDD that contains the queries (to be used only if query RDD
@@ -88,5 +112,6 @@
// of LuceneRDD. Note it duplicates each partition of LuceneRDD n times where n is the number of
// partitions of the queries RDD.
linker.method = "collectbroadcast"
linker.method = ${?LUCENE_SPATIAL_LINKER_METHOD}
}
}
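
The `${?VAR}` lines added throughout this file use HOCON's optional substitution: if the named environment variable is unset, the preceding default stands; otherwise the variable's value wins. A minimal sketch of the effect with Typesafe Config, which this reference.conf format implies:

```scala
import com.typesafe.config.ConfigFactory

// Loads reference.conf and resolves ${?...} substitutions, falling back to
// environment variables for names not defined in the config itself.
val config = ConfigFactory.load()

// Prints "en" unless LUCENERDD_ANALYZER_NAME is set in the environment.
println(config.getString("lucenerdd.analyzer.name"))

// Prints "collectbroadcast" unless LUCENERDD_LINKER_METHOD overrides it,
// e.g. by exporting LUCENERDD_LINKER_METHOD=cartesian before launching.
println(config.getString("lucenerdd.linker.method"))
```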