-
Notifications
You must be signed in to change notification settings - Fork 28.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-32512][SQL] add alter table add/drop partition command for datasourcev2 #29339
Changes from 2 commits
a57aafc
6efca68
f0bc357
61cae52
b1fc84b
67fcb12
f4a6ee3
9bd20ba
6327ead
800c51a
60b0a12
a4c29a2
a6caf68
3405f5b
9bc2e76
0740ef5
ad40d7b
8fce669
af4b50b
b046909
fbc2b58
dedb32a
41cf069
96a62be
d5d8f13
6bf49bd
cdd7085
0545538
f1fcac1
7014ba1
69bbbd5
effd0ed
7377469
c6711cd
dcd5060
d316e56
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.spark.sql.connector.catalog; | ||
|
||
import java.util.Map; | ||
|
||
import org.apache.spark.annotation.Experimental; | ||
import org.apache.spark.sql.catalyst.InternalRow; | ||
import org.apache.spark.sql.catalyst.analysis.NoSuchPartitionException; | ||
import org.apache.spark.sql.catalyst.analysis.PartitionAlreadyExistsException; | ||
import org.apache.spark.sql.types.StructType; | ||
|
||
/** | ||
* A partition interface of {@link Table}. | ||
* A partition is composed of identifier and properties, | ||
* and properties contains metadata information of the partition. | ||
* <p> | ||
* These APIs are used to modify table partition identifier or partition metadata. | ||
* In some cases, they will change the table data as well. | ||
* ${@link #createPartition}: | ||
* add a partition and any data that its location contains to the table | ||
* ${@link #dropPartition}: | ||
* remove a partition and any data it contains from the table | ||
* ${@link #replacePartitionMetadata}: | ||
* point a partition to a new location, which will swap one location's data for the other | ||
* | ||
* @since 3.1.0 | ||
*/ | ||
@Experimental | ||
public interface SupportsPartitions extends Table { | ||
|
||
/** | ||
* @return the partition schema of table | ||
*/ | ||
StructType partitionSchema(); | ||
|
||
/** | ||
* Create a partition in table. | ||
* | ||
* @param ident a new partition identifier | ||
* @param properties the metadata of a partition | ||
* @throws PartitionAlreadyExistsException If a partition already exists for the identifier | ||
* @throws UnsupportedOperationException If partition property is not supported | ||
*/ | ||
void createPartition( | ||
InternalRow ident, | ||
Map<String, String> properties) | ||
throws PartitionAlreadyExistsException, UnsupportedOperationException; | ||
|
||
/** | ||
* Drop a partition from table. | ||
* | ||
* @param ident a partition identifier | ||
* @return true if a partition was deleted, false if no partition exists for the identifier | ||
*/ | ||
boolean dropPartition(InternalRow ident); | ||
|
||
/** | ||
* Replace the partition metadata of the existing partition. | ||
* | ||
* @param ident the partition identifier of the existing partition | ||
* @param properties the new metadata of the partition | ||
* @throws NoSuchPartitionException If the partition identifier to alter doesn't exist | ||
* @throws UnsupportedOperationException If partition property is not supported | ||
*/ | ||
void replacePartitionMetadata( | ||
InternalRow ident, | ||
Map<String, String> properties) | ||
throws NoSuchPartitionException, UnsupportedOperationException; | ||
|
||
/** | ||
* Retrieve the partition metadata of the existing partition. | ||
* | ||
* @param ident a partition identifier | ||
* @return the metadata of the partition | ||
* @throws UnsupportedOperationException If partition property is not supported | ||
*/ | ||
Map<String, String> loadPartitionMetadata(InternalRow ident) | ||
throws UnsupportedOperationException; | ||
|
||
/** | ||
* List the identifiers of all partitions that contains the ident in a table. | ||
* | ||
* @param ident a prefix of partition identifier | ||
* @return an array of Identifiers for the partitions | ||
*/ | ||
InternalRow[] listPartitionIdentifiers(InternalRow ident); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,10 @@ package org.apache.spark.sql.execution.datasources.v2 | |
import scala.collection.JavaConverters._ | ||
|
||
import org.apache.spark.sql.AnalysisException | ||
import org.apache.spark.sql.connector.catalog.{SupportsDelete, SupportsRead, SupportsWrite, Table, TableCapability} | ||
import org.apache.spark.sql.catalyst.InternalRow | ||
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec | ||
import org.apache.spark.sql.connector.catalog.{SupportsDelete, SupportsPartitions, SupportsRead, SupportsWrite, Table, TableCapability} | ||
import org.apache.spark.sql.types.{ByteType, DoubleType, FloatType, IntegerType, LongType, ShortType, StringType, StructType} | ||
import org.apache.spark.sql.util.CaseInsensitiveStringMap | ||
|
||
object DataSourceV2Implicits { | ||
|
@@ -52,6 +55,15 @@ object DataSourceV2Implicits { | |
} | ||
} | ||
|
||
def asPartitionable: SupportsPartitions = { | ||
table match { | ||
case support: SupportsPartitions => | ||
support | ||
case _ => | ||
throw new AnalysisException(s"Table does not support partitions: ${table.name}") | ||
} | ||
} | ||
|
||
def supports(capability: TableCapability): Boolean = table.capabilities.contains(capability) | ||
|
||
def supportsAny(capabilities: TableCapability*): Boolean = capabilities.exists(supports) | ||
|
@@ -62,4 +74,31 @@ object DataSourceV2Implicits { | |
new CaseInsensitiveStringMap(options.asJava) | ||
} | ||
} | ||
|
||
def convertPartitionIndentifers( | ||
partSpec: TablePartitionSpec, | ||
partSchema: StructType): InternalRow = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this included with the implicits when it isn't an implicit class? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, a little ugly to me if it defined with implicits. I can change it if you think it's better with implicits. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think it needs to be implicit. I just don't think it belongs in the implicits class if it isn't an implicit. I think there is a util class you could include this in. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I misunderstood. thanks. |
||
val partValues = partSchema.map { part => | ||
part.dataType match { | ||
case _: ByteType => | ||
partSpec.getOrElse(part.name, "0").toByte | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Conversion to I think this should require that all partition names are present in the map, and pass null if a name is present but does not have a value. If the partition doesn't allow null partition values, then it should throw an exception. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure. sounds reasonable to me. |
||
case _: ShortType => | ||
partSpec.getOrElse(part.name, "0").toShort | ||
case _: IntegerType => | ||
partSpec.getOrElse(part.name, "0").toInt | ||
case _: LongType => | ||
partSpec.getOrElse(part.name, "0").toLong | ||
case _: FloatType => | ||
partSpec.getOrElse(part.name, "0").toFloat | ||
case _: DoubleType => | ||
partSpec.getOrElse(part.name, "0").toDouble | ||
case _: StringType => | ||
partSpec.getOrElse(part.name, "") | ||
case _ => | ||
throw new AnalysisException( | ||
s"Type ${part.dataType.typeName} is not supported for partition.") | ||
} | ||
} | ||
InternalRow.fromSeq(partValues) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.sql.connector | ||
|
||
import java.util | ||
import java.util.concurrent.ConcurrentHashMap | ||
|
||
import scala.collection.JavaConverters._ | ||
|
||
import org.apache.spark.sql.catalyst.InternalRow | ||
import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, PartitionAlreadyExistsException} | ||
import org.apache.spark.sql.connector.catalog.SupportsPartitions | ||
import org.apache.spark.sql.connector.expressions.Transform | ||
import org.apache.spark.sql.types.StructType | ||
|
||
|
||
/** | ||
* This class is used to test SupportsPartitions API. | ||
*/ | ||
class InMemoryPartitionTable( | ||
name: String, | ||
schema: StructType, | ||
partitioning: Array[Transform], | ||
properties: util.Map[String, String]) | ||
extends InMemoryTable(name, schema, partitioning, properties) with SupportsPartitions { | ||
import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ | ||
|
||
private val memoryTablePartitions: util.Map[InternalRow, util.Map[String, String]] = | ||
new ConcurrentHashMap[InternalRow, util.Map[String, String]]() | ||
|
||
def partitionSchema: StructType = { | ||
val partitionColumnNames = partitioning.toSeq.asPartitionColumns | ||
new StructType(schema.filter(p => partitionColumnNames.contains(p.name)).toArray) | ||
} | ||
|
||
def createPartition( | ||
ident: InternalRow, | ||
properties: util.Map[String, String]): Unit = { | ||
if (memoryTablePartitions.containsKey(ident)) { | ||
throw new PartitionAlreadyExistsException(name, ident, partitionSchema) | ||
} else { | ||
memoryTablePartitions.put(ident, properties) | ||
} | ||
} | ||
|
||
def dropPartition(ident: InternalRow): Boolean = { | ||
if (memoryTablePartitions.containsKey(ident)) { | ||
memoryTablePartitions.remove(ident) | ||
true | ||
} else { | ||
false | ||
} | ||
} | ||
|
||
def replacePartitionMetadata(ident: InternalRow, properties: util.Map[String, String]): Unit = { | ||
if (memoryTablePartitions.containsKey(ident)) { | ||
memoryTablePartitions.put(ident, properties) | ||
} else { | ||
throw new NoSuchPartitionException(name, ident, partitionSchema) | ||
} | ||
} | ||
|
||
def loadPartitionMetadata(ident: InternalRow): util.Map[String, String] = { | ||
if (memoryTablePartitions.containsKey(ident)) { | ||
memoryTablePartitions.get(ident) | ||
} else { | ||
throw new NoSuchPartitionException(name, ident, partitionSchema) | ||
} | ||
} | ||
|
||
def listPartitionIdentifiers(ident: InternalRow): Array[InternalRow] = { | ||
val prefixPartCols = | ||
new StructType(partitionSchema.dropRight(partitionSchema.length - ident.numFields).toArray) | ||
val prefixPart = ident.toSeq(prefixPartCols) | ||
memoryTablePartitions.keySet().asScala | ||
.filter(_.toSeq(partitionSchema).startsWith(prefixPart)).toArray | ||
} | ||
|
||
def partitionExists(ident: InternalRow): Boolean = { | ||
memoryTablePartitions.containsKey(ident) | ||
} | ||
|
||
def clearPartitions(): Unit = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can remove it now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes |
||
memoryTablePartitions.clear() | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the second `PARTITION` keyword mandatory? Can you use a similar
format as for `DROP`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeap, thanks