
Commit 40cb28f

dchvn authored and chenzhx committed
[SPARK-37343][SQL] Implement createIndex, IndexExists and dropIndex in JDBC (Postgres dialect)
### What changes were proposed in this pull request?

Implement `createIndex`/`IndexExists`/`dropIndex` in DS V2 JDBC for the Postgres dialect.

### Why are the changes needed?

This is a subtask of the V2 index support. This PR implements `createIndex`, `IndexExists` and `dropIndex` for the Postgres dialect only; after this PR is reviewed, I will either open a new PR for `listIndexes` or add it to this one.

### Does this PR introduce _any_ user-facing change?

Yes, `createIndex`/`IndexExists`/`dropIndex` in DS V2 JDBC.

### How was this patch tested?

New test.

Closes apache#34673 from dchvn/Dsv2_index_postgres.

Authored-by: dch nguyen <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
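For context, here is a minimal sketch of the user-facing flow this enables, pieced together from the SQL statements exercised in the updated V2JDBCTest below. The catalog name `postgresql` and table name `tbl` are illustrative and assume a DS V2 JDBC catalog is registered in the Spark session:

    // Illustrative only: assumes a JDBC catalog named "postgresql" is configured,
    // e.g. spark.sql.catalog.postgresql set to the JDBCTableCatalog plus connection options.
    spark.sql("CREATE INDEX i1 ON postgresql.tbl USING BTREE (col1)")
    spark.sql("CREATE INDEX i2 ON postgresql.tbl (col2, col3) OPTIONS (FILLFACTOR=70)")
    spark.sql("DROP INDEX IF EXISTS i1 ON postgresql.tbl")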
1 parent 774f1df commit 40cb28f

File tree: 6 files changed, +170 −124 lines

external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala

+4 −22

@@ -18,24 +18,21 @@
 package org.apache.spark.sql.jdbc.v2

 import java.sql.{Connection, SQLFeatureNotSupportedException}
-import java.util

 import org.scalatest.time.SpanSugar._

 import org.apache.spark.SparkConf
 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.connector.catalog.index.SupportsIndex
-import org.apache.spark.sql.connector.expressions.{FieldReference, NamedReference}
 import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog
 import org.apache.spark.sql.jdbc.{DatabaseOnDocker, DockerJDBCIntegrationSuite}
 import org.apache.spark.sql.types._
 import org.apache.spark.tags.DockerTest

 /**
  *
- * To run this test suite for a specific version (e.g., mysql:5.7.31):
+ * To run this test suite for a specific version (e.g., mysql:5.7.36):
  * {{{
- *   ENABLE_DOCKER_INTEGRATION_TESTS=1 MYSQL_DOCKER_IMAGE_NAME=mysql:5.7.31
+ *   ENABLE_DOCKER_INTEGRATION_TESTS=1 MYSQL_DOCKER_IMAGE_NAME=mysql:5.7.36
  *     ./build/sbt -Pdocker-integration-tests "testOnly *v2*MySQLIntegrationSuite"
  *
  * }}}
@@ -45,7 +42,7 @@ import org.apache.spark.tags.DockerTest
 class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTest {
   override val catalogName: String = "mysql"
   override val db = new DatabaseOnDocker {
-    override val imageName = sys.env.getOrElse("MYSQL_DOCKER_IMAGE_NAME", "mysql:5.7.31")
+    override val imageName = sys.env.getOrElse("MYSQL_DOCKER_IMAGE_NAME", "mysql:5.7.36")
     override val env = Map(
       "MYSQL_ROOT_PASSWORD" -> "rootpass"
     )
@@ -121,20 +118,5 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTest {

   override def supportsIndex: Boolean = true

-  override def testIndexProperties(jdbcTable: SupportsIndex): Unit = {
-    val properties = new util.Properties();
-    properties.put("KEY_BLOCK_SIZE", "10")
-    properties.put("COMMENT", "'this is a comment'")
-    // MySQL doesn't allow property set on individual column, so use empty Array for
-    // column properties
-    jdbcTable.createIndex("i1", "BTREE", Array(FieldReference("col1")),
-      Array.empty[util.Map[NamedReference, util.Properties]], properties)
-
-    var index = jdbcTable.listIndexes()
-    // The index property size is actually 1. Even though the index is created
-    // with properties "KEY_BLOCK_SIZE", "10" and "COMMENT", "'this is a comment'", when
-    // retrieving index using `SHOW INDEXES`, MySQL only returns `COMMENT`.
-    assert(index(0).properties.size == 1)
-    assert(index(0).properties.get("COMMENT").equals("this is a comment"))
-  }
+  override def indexOptions: String = "KEY_BLOCK_SIZE=10"
 }

external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala

+4

@@ -80,4 +80,8 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTes
   }

   override def supportsTableSample: Boolean = true
+
+  override def supportsIndex: Boolean = true
+
+  override def indexOptions: String = "FILLFACTOR=70"
 }
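The suite opts into the shared index tests by overriding `supportsIndex`, and the OPTIONS clause in those tests then carries the Postgres-specific `FILLFACTOR=70`. For reference, a sketch of how the shared V2JDBCTest trait (next file) reaches the DS V2 index API programmatically; the names mirror the test code, and `conf` is the session's SQLConf from SharedSparkSession:

    import org.apache.spark.sql.connector.catalog.{Catalogs, Identifier, TableCatalog}
    import org.apache.spark.sql.connector.catalog.index.SupportsIndex

    // Mirrors the loading pattern in V2JDBCTest; "new_table" is the test table.
    val loaded = Catalogs.load("postgresql", conf)
    val jdbcTable = loaded.asInstanceOf[TableCatalog]
      .loadTable(Identifier.of(Array.empty[String], "new_table"))
      .asInstanceOf[SupportsIndex]
    assert(!jdbcTable.indexExists("i1"))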

external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala

+26 −62

@@ -17,11 +17,10 @@

 package org.apache.spark.sql.jdbc.v2

-import java.util
-
 import org.apache.log4j.Level

 import org.apache.spark.sql.{AnalysisException, DataFrame}
+import org.apache.spark.sql.catalyst.analysis.{IndexAlreadyExistsException, NoSuchIndexException}
 import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, Sample}
 import org.apache.spark.sql.connector.catalog.{Catalogs, Identifier, TableCatalog}
 import org.apache.spark.sql.connector.catalog.index.SupportsIndex
@@ -191,91 +190,56 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu
   }

   def supportsIndex: Boolean = false
-  def testIndexProperties(jdbcTable: SupportsIndex): Unit = {}

-  test("SPARK-36913: Test INDEX") {
+  def indexOptions: String = ""
+
+  test("SPARK-36895: Test INDEX Using SQL") {
     if (supportsIndex) {
       withTable(s"$catalogName.new_table") {
         sql(s"CREATE TABLE $catalogName.new_table(col1 INT, col2 INT, col3 INT," +
-          s" col4 INT, col5 INT)")
+          " col4 INT, col5 INT)")
         val loaded = Catalogs.load(catalogName, conf)
         val jdbcTable = loaded.asInstanceOf[TableCatalog]
           .loadTable(Identifier.of(Array.empty[String], "new_table"))
           .asInstanceOf[SupportsIndex]
         assert(jdbcTable.indexExists("i1") == false)
         assert(jdbcTable.indexExists("i2") == false)

-        val properties = new util.Properties();
         val indexType = "DUMMY"
         var m = intercept[UnsupportedOperationException] {
-          jdbcTable.createIndex("i1", indexType, Array(FieldReference("col1")),
-            Array.empty[util.Map[NamedReference, util.Properties]], properties)
+          sql(s"CREATE index i1 ON $catalogName.new_table USING $indexType (col1)")
         }.getMessage
         assert(m.contains(s"Index Type $indexType is not supported." +
-          s" The supported Index Types are: BTREE and HASH"))
-
-        jdbcTable.createIndex("i1", "BTREE", Array(FieldReference("col1")),
-          Array.empty[util.Map[NamedReference, util.Properties]], properties)
+          s" The supported Index Types are:"))

-        jdbcTable.createIndex("i2", "",
-          Array(FieldReference("col2"), FieldReference("col3"), FieldReference("col5")),
-          Array.empty[util.Map[NamedReference, util.Properties]], properties)
+        sql(s"CREATE index i1 ON $catalogName.new_table USING BTREE (col1)")
+        sql(s"CREATE index i2 ON $catalogName.new_table (col2, col3, col5)" +
+          s" OPTIONS ($indexOptions)")

         assert(jdbcTable.indexExists("i1") == true)
         assert(jdbcTable.indexExists("i2") == true)

+        // This should pass without exception
+        sql(s"CREATE index IF NOT EXISTS i1 ON $catalogName.new_table (col1)")
+
         m = intercept[IndexAlreadyExistsException] {
-          jdbcTable.createIndex("i1", "", Array(FieldReference("col1")),
-            Array.empty[util.Map[NamedReference, util.Properties]], properties)
+          sql(s"CREATE index i1 ON $catalogName.new_table (col1)")
         }.getMessage
-        assert(m.contains("Failed to create index: i1 in new_table"))
-
-        var index = jdbcTable.listIndexes()
-        assert(index.length == 2)
-
-        assert(index(0).indexName.equals("i1"))
-        assert(index(0).indexType.equals("BTREE"))
-        var cols = index(0).columns
-        assert(cols.length == 1)
-        assert(cols(0).describe().equals("col1"))
-        assert(index(0).properties.size == 0)
-
-        assert(index(1).indexName.equals("i2"))
-        assert(index(1).indexType.equals("BTREE"))
-        cols = index(1).columns
-        assert(cols.length == 3)
-        assert(cols(0).describe().equals("col2"))
-        assert(cols(1).describe().equals("col3"))
-        assert(cols(2).describe().equals("col5"))
-        assert(index(1).properties.size == 0)
-
-        jdbcTable.dropIndex("i1")
-        assert(jdbcTable.indexExists("i1") == false)
-        assert(jdbcTable.indexExists("i2") == true)
-
-        index = jdbcTable.listIndexes()
-        assert(index.length == 1)
+        assert(m.contains("Failed to create index i1 in new_table"))

-        assert(index(0).indexName.equals("i2"))
-        assert(index(0).indexType.equals("BTREE"))
-        cols = index(0).columns
-        assert(cols.length == 3)
-        assert(cols(0).describe().equals("col2"))
-        assert(cols(1).describe().equals("col3"))
-        assert(cols(2).describe().equals("col5"))
+        sql(s"DROP index i1 ON $catalogName.new_table")
+        sql(s"DROP index i2 ON $catalogName.new_table")

-        jdbcTable.dropIndex("i2")
         assert(jdbcTable.indexExists("i1") == false)
         assert(jdbcTable.indexExists("i2") == false)
-        index = jdbcTable.listIndexes()
-        assert(index.length == 0)
+
+        // This should pass without exception
+        sql(s"DROP index IF EXISTS i1 ON $catalogName.new_table")

         m = intercept[NoSuchIndexException] {
-          jdbcTable.dropIndex("i2")
+          sql(s"DROP index i1 ON $catalogName.new_table")
         }.getMessage
-        assert(m.contains("Failed to drop index: i2"))
-
-        testIndexProperties(jdbcTable)
+        assert(m.contains("Failed to drop index i1 in new_table"))
       }
     }
   }
@@ -338,23 +302,23 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu
     assert(samplePushed(df3))
     assert(limitPushed(df3, 2))
     assert(columnPruned(df3, "col1"))
-    assert(df3.collect().length == 2)
+    assert(df3.collect().length <= 2)

     // sample(... PERCENT) push down + limit push down + column pruning
     val df4 = sql(s"SELECT col1 FROM $catalogName.new_table" +
       " TABLESAMPLE (50 PERCENT) REPEATABLE (12345) LIMIT 2")
     assert(samplePushed(df4))
     assert(limitPushed(df4, 2))
     assert(columnPruned(df4, "col1"))
-    assert(df4.collect().length == 2)
+    assert(df4.collect().length <= 2)

     // sample push down + filter push down + limit push down
     val df5 = sql(s"SELECT * FROM $catalogName.new_table" +
       " TABLESAMPLE (BUCKET 6 OUT OF 10) WHERE col1 > 0 LIMIT 2")
     assert(samplePushed(df5))
     assert(filterPushed(df5))
     assert(limitPushed(df5, 2))
-    assert(df5.collect().length == 2)
+    assert(df5.collect().length <= 2)

     // sample + filter + limit + column pruning
     // sample pushed down, filter/limit not pushed down, column pruned
@@ -365,7 +329,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu
     assert(!filterPushed(df6))
     assert(!limitPushed(df6, 2))
     assert(columnPruned(df6, "col1"))
-    assert(df6.collect().length == 2)
+    assert(df6.collect().length <= 2)

     // sample + limit
     // Push down order is sample -> filter -> limit
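Note on the relaxed assertions above: TABLESAMPLE draws a probabilistic subset of rows, so once the sample and the limit are pushed down to the database a query may legitimately return fewer rows than the limit; hence `collect().length == 2` becomes `collect().length <= 2`.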

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala

+70 −1

@@ -23,6 +23,8 @@ import java.util
 import java.util.Locale
 import java.util.concurrent.TimeUnit

+import scala.collection.JavaConverters._
+import scala.collection.mutable.ArrayBuffer
 import scala.util.Try
 import scala.util.control.NonFatal

@@ -38,7 +40,7 @@ import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils, GenericArrayData}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils.{instantToMicros, localDateToDays, toJavaDate, toJavaTimestamp}
 import org.apache.spark.sql.connector.catalog.TableChange
-import org.apache.spark.sql.connector.catalog.index.TableIndex
+import org.apache.spark.sql.connector.catalog.index.{SupportsIndex, TableIndex}
 import org.apache.spark.sql.connector.expressions.NamedReference
 import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
 import org.apache.spark.sql.execution.datasources.jdbc.connection.ConnectionProvider
@@ -1084,6 +1086,73 @@ object JdbcUtils extends Logging {
     }
   }

+  /**
+   * Check if an index exists in a table.
+   */
+  def checkIfIndexExists(
+      conn: Connection,
+      sql: String,
+      options: JDBCOptions): Boolean = {
+    val statement = conn.createStatement
+    try {
+      statement.setQueryTimeout(options.queryTimeout)
+      val rs = statement.executeQuery(sql)
+      rs.next
+    } catch {
+      case _: Exception =>
+        logWarning("Cannot retrieve index info.")
+        false
+    } finally {
+      statement.close()
+    }
+  }
+
+  /**
+   * Process index properties and return a tuple of the index type and the list of the
+   * other index properties.
+   */
+  def processIndexProperties(
+      properties: util.Map[String, String],
+      catalogName: String): (String, Array[String]) = {
+    var indexType = ""
+    val indexPropertyList: ArrayBuffer[String] = ArrayBuffer[String]()
+    val supportedIndexTypeList = getSupportedIndexTypeList(catalogName)
+
+    if (!properties.isEmpty) {
+      properties.asScala.foreach { case (k, v) =>
+        if (k.equals(SupportsIndex.PROP_TYPE)) {
+          if (containsIndexTypeIgnoreCase(supportedIndexTypeList, v)) {
+            indexType = s"USING $v"
+          } else {
+            throw new UnsupportedOperationException(s"Index Type $v is not supported." +
+              s" The supported Index Types are: ${supportedIndexTypeList.mkString(" AND ")}")
+          }
+        } else {
+          indexPropertyList.append(s"$k = $v")
+        }
+      }
+    }
+    (indexType, indexPropertyList.toArray)
+  }
+
+  def containsIndexTypeIgnoreCase(supportedIndexTypeList: Array[String], value: String): Boolean = {
+    if (supportedIndexTypeList.isEmpty) {
+      throw new UnsupportedOperationException(
+        "Cannot specify 'USING index_type' in 'CREATE INDEX'")
+    }
+    for (indexType <- supportedIndexTypeList) {
+      if (value.equalsIgnoreCase(indexType)) return true
+    }
+    false
+  }
+
+  def getSupportedIndexTypeList(catalogName: String): Array[String] = {
+    catalogName match {
+      case "mysql" => Array("BTREE", "HASH")
+      case "postgresql" => Array("BTREE", "HASH", "BRIN")
+      case _ => Array.empty
+    }
+  }
+
   def executeQuery(conn: Connection, options: JDBCOptions, sql: String): ResultSet = {
     val statement = conn.createStatement
     try {
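A sketch of how a dialect is expected to consume these helpers, with hypothetical property values; the map mirrors what the SQL layer passes down, keying the index type by `SupportsIndex.PROP_TYPE`:

    // Hypothetical illustration of JdbcUtils.processIndexProperties.
    import java.util.{HashMap => JHashMap}
    import org.apache.spark.sql.connector.catalog.index.SupportsIndex

    val props = new JHashMap[String, String]()
    props.put(SupportsIndex.PROP_TYPE, "BTREE") // rendered as "USING BTREE"
    props.put("FILLFACTOR", "70")               // passed through as "FILLFACTOR = 70"
    val (indexType, otherProps) = JdbcUtils.processIndexProperties(props, "postgresql")
    // indexType == "USING BTREE"; otherProps contains "FILLFACTOR = 70"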

sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala

+7 −37

@@ -21,8 +21,6 @@ import java.sql.{Connection, SQLException, Types}
 import java.util
 import java.util.Locale

-import scala.collection.JavaConverters._
-
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.SQLConfHelper
 import org.apache.spark.sql.catalyst.analysis.{IndexAlreadyExistsException, NoSuchIndexException}
@@ -115,32 +113,17 @@ private case object MySQLDialect extends JdbcDialect with SQLConfHelper {
   // https://dev.mysql.com/doc/refman/8.0/en/create-index.html
   override def createIndex(
       indexName: String,
-      indexType: String,
       tableName: String,
       columns: Array[NamedReference],
       columnsProperties: Array[util.Map[NamedReference, util.Properties]],
       properties: util.Properties): String = {
     val columnList = columns.map(col => quoteIdentifier(col.fieldNames.head))
-    var indexProperties: String = ""
-    val scalaProps = properties.asScala
-    if (!properties.isEmpty) {
-      scalaProps.foreach { case (k, v) =>
-        indexProperties = indexProperties + " " + s"$k $v"
-      }
-    }
-    val iType = if (indexType.isEmpty) {
-      ""
-    } else {
-      if (indexType.length > 1 && !indexType.equalsIgnoreCase("BTREE") &&
-        !indexType.equalsIgnoreCase("HASH")) {
-        throw new UnsupportedOperationException(s"Index Type $indexType is not supported." +
-          " The supported Index Types are: BTREE and HASH")
-      }
-      s"USING $indexType"
-    }
+    val (indexType, indexPropertyList) = JdbcUtils.processIndexProperties(properties, "mysql")
+
     // columnsProperties doesn't apply to MySQL so it is ignored
-    s"CREATE INDEX ${quoteIdentifier(indexName)} $iType ON" +
-      s" ${quoteIdentifier(tableName)} (${columnList.mkString(", ")}) $indexProperties"
+    s"CREATE INDEX ${quoteIdentifier(indexName)} $indexType ON" +
+      s" ${quoteIdentifier(tableName)} (${columnList.mkString(", ")})" +
+      s" ${indexPropertyList.mkString(" ")}"
   }

   // SHOW INDEX syntax
@@ -150,21 +133,8 @@ private case object MySQLDialect extends JdbcDialect with SQLConfHelper {
       indexName: String,
       tableName: String,
       options: JDBCOptions): Boolean = {
-    val sql = s"SHOW INDEXES FROM ${quoteIdentifier(tableName)}"
-    try {
-      val rs = JdbcUtils.executeQuery(conn, options, sql)
-      while (rs.next()) {
-        val retrievedIndexName = rs.getString("key_name")
-        if (conf.resolver(retrievedIndexName, indexName)) {
-          return true
-        }
-      }
-      false
-    } catch {
-      case _: Exception =>
-        logWarning("Cannot retrieved index info.")
-        false
-    }
+    val sql = s"SHOW INDEXES FROM ${quoteIdentifier(tableName)} WHERE key_name = '$indexName'"
+    JdbcUtils.checkIfIndexExists(conn, sql, options)
   }

   override def dropIndex(indexName: String, tableName: String): String = {
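With the refactor, the DDL now comes from the shared helpers. A hypothetical rendering for an index `i1` on table `tbl` created with properties `{TYPE -> BTREE, KEY_BLOCK_SIZE -> 10}` (identifiers backtick-quoted by `quoteIdentifier`):

    // Sketch of the string createIndex would return under the above assumptions:
    //   CREATE INDEX `i1` USING BTREE ON `tbl` (`col1`) KEY_BLOCK_SIZE = 10
    // indexExists now issues a single-row probe and delegates to JdbcUtils:
    //   SHOW INDEXES FROM `tbl` WHERE key_name = 'i1'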
