Skip to content

Commit

Permalink
Enabled CLI to read predefined and customized CSV files (#480)
Browse files Browse the repository at this point in the history
  • Loading branch information
lziq authored Jan 3, 2022
1 parent aaa5936 commit 5044be1
Show file tree
Hide file tree
Showing 6 changed files with 171 additions and 28 deletions.
2 changes: 2 additions & 0 deletions cli/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ dependencies {
implementation 'net.sf.jopt-simple:jopt-simple:[5.0,6.0)'
implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8'

implementation 'org.apache.commons:commons-csv:1.8'

testImplementation 'org.junit.vintage:junit-vintage-engine:5.7.0'
}

Expand Down
35 changes: 30 additions & 5 deletions cli/src/org/partiql/cli/functions/ReadFile.kt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package org.partiql.cli.functions

import com.amazon.ion.IonStruct
import org.apache.commons.csv.CSVFormat
import org.partiql.lang.eval.Environment
import org.partiql.lang.eval.ExprValue
import org.partiql.lang.eval.ExprValueFactory
Expand All @@ -34,14 +35,30 @@ internal class ReadFile(valueFactory: ExprValueFactory) : BaseFunction(valueFact
ConversionMode.values().find { it.name.toLowerCase() == name } ?:
throw IllegalArgumentException( "Unknown conversion: $name")

private fun delimitedReadHandler(delimiter: Char): (InputStream, IonStruct) -> ExprValue = { input, options ->
private fun fileReadHandler(csvFormat: CSVFormat): (InputStream, IonStruct) -> ExprValue = { input, options ->
val encoding = options["encoding"]?.stringValue() ?: "UTF-8"
val reader = InputStreamReader(input, encoding)
val conversion = options["conversion"]?.stringValue() ?: "none"

val hasHeader = options["header"]?.booleanValue() ?: false
val ignoreEmptyLine = options["ignore_empty_line"]?.booleanValue() ?: true
val ignoreSurroundingSpace = options["ignore_surrounding_space"]?.booleanValue() ?: true
val trim = options["trim"]?.booleanValue() ?: true
val delimiter = options["delimiter"]?.stringValue()?.first() // CSVParser library only accepts a single character as delimiter
val record = options["line_breaker"]?.stringValue()
val escape = options["escape"]?.stringValue()?.first() // CSVParser library only accepts a single character as escape
val quote = options["quote"]?.stringValue()?.first() // CSVParser library only accepts a single character as quote

val reader = InputStreamReader(input, encoding)
val csvFormatWithOptions = csvFormat.withIgnoreEmptyLines(ignoreEmptyLine)
.withIgnoreSurroundingSpaces(ignoreSurroundingSpace)
.withTrim(trim)
.let { if (hasHeader) it.withFirstRecordAsHeader() else it }
.let { if (delimiter != null) it.withDelimiter(delimiter) else it }
.let { if (record != null) it.withRecordSeparator(record) else it }
.let { if (escape != null) it.withEscape(escape) else it }
.let { if (quote != null) it.withQuote(quote) else it }

DelimitedValues.exprValue(valueFactory, reader, delimiter, hasHeader, conversionModeFor(conversion))
DelimitedValues.exprValue(valueFactory, reader, csvFormatWithOptions, conversionModeFor(conversion))
}

private fun ionReadHandler(): (InputStream, IonStruct) -> ExprValue = { input, _ ->
Expand All @@ -50,8 +67,16 @@ internal class ReadFile(valueFactory: ExprValueFactory) : BaseFunction(valueFact

private val readHandlers = mapOf(
"ion" to ionReadHandler(),
"tsv" to delimitedReadHandler('\t'),
"csv" to delimitedReadHandler(','))
"csv" to fileReadHandler(CSVFormat.DEFAULT),
"tsv" to fileReadHandler(CSVFormat.DEFAULT.withDelimiter('\t')),
"excel_csv" to fileReadHandler(CSVFormat.EXCEL),
"mysql_csv" to fileReadHandler(CSVFormat.MYSQL),
"mongodb_csv" to fileReadHandler(CSVFormat.MONGODB_CSV),
"mongodb_tsv" to fileReadHandler(CSVFormat.MONGODB_TSV),
"postgresql_csv" to fileReadHandler(CSVFormat.POSTGRESQL_CSV),
"postgresql_text" to fileReadHandler(CSVFormat.POSTGRESQL_TEXT),
"customized" to fileReadHandler(CSVFormat.DEFAULT)
)

override fun call(env: Environment, args: List<ExprValue>): ExprValue {
val options = optionsStruct(1, args)
Expand Down
96 changes: 96 additions & 0 deletions cli/test/org/partiql/cli/functions/ReadFileTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,100 @@ class ReadFileTest {

assertEquals(ion.singleValue(expected), actual)
}

/** Reads a headered file through the predefined `excel_csv` type and expects one struct of string fields. */
@Test
fun readExcelCsvFile() {
    val fileName = "simple_excel.csv"
    writeFile(fileName, "title,category,price\nharry potter,book,7.99")

    // First argument is the quoted file path, second is the options struct.
    val pathArg = "\"${dirPath(fileName)}\"".exprValue()
    val optionsArg = "{type:\"excel_csv\", header:true}".exprValue()
    val result = function.call(env, listOf(pathArg, optionsArg)).ionValue

    val expectedIon = ion.singleValue("[{title:\"harry potter\",category:\"book\",price:\"7.99\"}]")
    assertEquals(expectedIon, result)
}

/** Reads a headered file through the predefined `postgresql_csv` type and expects one struct of string fields. */
@Test
fun readPostgreSQLCsvFile() {
    val fileName = "simple_postgresql.csv"
    writeFile(fileName, "id,name,balance\n1,Bob,10000.00")

    // First argument is the quoted file path, second is the options struct.
    val pathArg = "\"${dirPath(fileName)}\"".exprValue()
    val optionsArg = "{type:\"postgresql_csv\", header:true}".exprValue()
    val result = function.call(env, listOf(pathArg, optionsArg)).ionValue

    val expectedIon = ion.singleValue("[{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]")
    assertEquals(expectedIon, result)
}

/** Reads a space-separated file via the `customized` type with an explicit `delimiter` option. */
@Test
fun readCustomizedCsvFile1() { // delimiter
    val fileName = "customized.csv"
    writeFile(fileName, "id name balance\n1 Bob 10000.00")

    val pathArg = "\"${dirPath(fileName)}\"".exprValue()
    val optionsArg = "{type:\"customized\", header:true, delimiter:' '}".exprValue()
    val result = function.call(env, listOf(pathArg, optionsArg)).ionValue

    val expectedIon = ion.singleValue("[{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]")
    assertEquals(expectedIon, result)
}

/**
 * Reads a file containing a blank line with `ignore_empty_line` set to false.
 * The blank line is expected to surface as a struct whose only field is an empty `id`.
 */
@Test
fun readCustomizedCsvFile2() { // ignore_empty_line
    val fileName = "customized.csv"
    writeFile(fileName, "id,name,balance\n\n1,Bob,10000.00")

    val pathArg = "\"${dirPath(fileName)}\"".exprValue()
    val optionsArg = "{type:\"customized\", header:true, ignore_empty_line: false}".exprValue()
    val result = function.call(env, listOf(pathArg, optionsArg)).ionValue

    val expectedIon = ion.singleValue("[{id:\"\"},{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]")
    assertEquals(expectedIon, result)
}

/**
 * Reads space-padded values with both `ignore_surrounding_space` and `trim` disabled.
 * The padding is expected to be preserved in every field value.
 */
@Test
fun readCustomizedCsvFile3() { // trim and ignore_surrounding_space
    val fileName = "customized.csv"
    writeFile(fileName, "id,name,balance\n 1 , Bob , 10000.00 ")

    val pathArg = "\"${dirPath(fileName)}\"".exprValue()
    val optionsArg =
        "{type:\"customized\", header:true, ignore_surrounding_space:false, trim:false}".exprValue()
    val result = function.call(env, listOf(pathArg, optionsArg)).ionValue

    val expectedIon = ion.singleValue("[{id:\" 1 \",name:\" Bob \",balance:\" 10000.00 \"}]")
    assertEquals(expectedIon, result)
}

/** Reads a CRLF-terminated file via the `customized` type while passing a `line_breaker` option. */
@Test
fun readCustomizedCsvFile4() { // line_breaker
    val fileName = "customized.csv"
    writeFile(fileName, "id,name,balance\r\n1,Bob,10000.00")

    val pathArg = "\"${dirPath(fileName)}\"".exprValue()
    // NOTE(review): this Kotlin literal puts backslash + CR + backslash + LF inside the quoted
    // option value, not the two-character escapes \r and \n; confirm that is the intended spelling.
    val optionsArg = "{type:\"customized\", header:true, line_breaker:'\\\r\\\n'}".exprValue()
    val result = function.call(env, listOf(pathArg, optionsArg)).ionValue

    val expectedIon = ion.singleValue("[{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]")
    assertEquals(expectedIon, result)
}

/**
 * Reads a quoted field containing a `/`-escaped double quote with `escape` set to `/`.
 * The `id` value is expected to be a double quote followed by `1`.
 */
@Test
fun readCustomizedCsvFile5() { // escape
    val fileName = "customized.csv"
    writeFile(fileName, "id,name,balance\n\"/\"1\",Bob,10000.00")

    val pathArg = "\"${dirPath(fileName)}\"".exprValue()
    val optionsArg = "{type:\"customized\", header:true, escape:'/'}".exprValue()
    val result = function.call(env, listOf(pathArg, optionsArg)).ionValue

    val expectedIon = ion.singleValue("[{id:\"\\\"1\",name:\"Bob\",balance:\"10000.00\"}]")
    assertEquals(expectedIon, result)
}

/**
 * Reads a field wrapped in single quotes with `quote` set to the single-quote character.
 * The delimiter inside the quotes is expected to stay part of the `id` value.
 */
@Test
fun readCustomizedCsvFile6() { // quote
    val fileName = "customized.csv"
    writeFile(fileName, "id,name,balance\n'1,',Bob,10000.00")

    val pathArg = "\"${dirPath(fileName)}\"".exprValue()
    val optionsArg = "{type:\"customized\", header:true, quote:\"'\"}".exprValue()
    val result = function.call(env, listOf(pathArg, optionsArg)).ionValue

    val expectedIon = ion.singleValue("[{id:\"1,\",name:\"Bob\",balance:\"10000.00\"}]")
    assertEquals(expectedIon, result)
}
}
32 changes: 32 additions & 0 deletions docs/user/CLI.md
Original file line number Diff line number Diff line change
Expand Up @@ -641,3 +641,35 @@ Kumo dog
Mochi dog
Lilikoi unicorn
```

## Predefined CSV Data

The `read_file` function provides options to read other predefined CSV data formats.
For example, if a CSV file is exported from PostgreSQL, we can use the following command
to read the file:
```
read_file('simple_postgresql.csv', {'type':'postgresql_csv'})
```
Other available options for the argument `type` besides `postgresql_csv` are `excel_csv`, `mysql_csv`, `mongodb_csv`, `mongodb_tsv`, and `postgresql_text`.

## Customized CSV Data
The `read_file` function also provides options to read customized CSV data formats.
For example, suppose we have a data file that uses a single space as the separator, as shown below:
```
title category price
harry_potter book 7.99
dot electronics 49.99
echo electronics 99.99
```
We can use the following command to read the file:
```
read_file('customized.csv', {'type':'customized', 'delimiter':' ', 'header':true})
```
The available options for customized CSV files are as follows:
1. Ignore empty lines: `'ignore_empty_line':true`
2. Ignore spaces surrounding the delimiter: `'ignore_surrounding_space':true`
3. Trim leading and trailing blanks: `'trim':true`
4. Set line breaker (only works with '\\r', '\\n' and '\\r\\n'): `'line_breaker':'\n'`
5. Set escape sign (single character only): `'escape':'\'`
6. Set quote sign (single character only): `'quote':'"'`
7. Set delimiter sign (single character only): `'delimiter':','`
13 changes: 3 additions & 10 deletions lang/src/org/partiql/lang/eval/io/DelimitedValues.kt
Original file line number Diff line number Diff line change
Expand Up @@ -59,25 +59,18 @@ object DelimitedValues {
* Lazily loads a stream of values from a [Reader] into a sequence backed [ExprValue].
* This does **not** close the [Reader].
*
* @param ion The system to use.
* @param input The input source.
* @param delimiter The delimiter to use between columns.
* @param hasHeader Whether the first row of the delimited input defines the columns.
* @param csvFormat The [CSVFormat] used to parse the input.
* @param conversionMode How column text should be converted.
*/
@JvmStatic
fun exprValue(valueFactory: ExprValueFactory,
input: Reader,
delimiter: Char,
hasHeader: Boolean,
csvFormat: CSVFormat,
conversionMode: ConversionMode): ExprValue {
val reader = BufferedReader(input)
val csvFormat = when (hasHeader){
true -> CSVFormat.DEFAULT.withDelimiter(delimiter).withFirstRecordAsHeader()
false -> CSVFormat.DEFAULT.withDelimiter(delimiter)
}
val csvParser = CSVParser(reader, csvFormat)
val columns: List<String> = csvParser.headerNames // `columns` is an empty list when `hasHeader` is false
val columns: List<String> = csvParser.headerNames

val seq = csvParser.asSequence().map { csvRecord ->
valueFactory.newStruct(
Expand Down
21 changes: 8 additions & 13 deletions lang/test/org/partiql/lang/eval/io/DelimitedValuesTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

package org.partiql.lang.eval.io

import org.apache.commons.csv.CSVFormat
import org.partiql.lang.*
import org.partiql.lang.eval.io.DelimitedValues.ConversionMode
import org.partiql.lang.eval.io.DelimitedValues.ConversionMode.*
Expand All @@ -32,10 +33,9 @@ class DelimitedValuesTest : TestBase() {
}

private fun read(text: String,
delimiter: Char,
hasHeader: Boolean,
csvFormat: CSVFormat,
conversionMode: ConversionMode): ExprValue =
DelimitedValues.exprValue(valueFactory, StringReader(text), delimiter, hasHeader, conversionMode)
DelimitedValues.exprValue(valueFactory, StringReader(text), csvFormat, conversionMode)

private fun assertWrite(expectedText: String,
valueText: String,
Expand Down Expand Up @@ -78,8 +78,7 @@ class DelimitedValuesTest : TestBase() {
"""[]""",
read(
"",
delimiter = ',',
hasHeader = false,
CSVFormat.DEFAULT,
conversionMode = NONE
)
)
Expand All @@ -89,8 +88,7 @@ class DelimitedValuesTest : TestBase() {
"""[]""",
read(
"",
delimiter = ',',
hasHeader = false,
CSVFormat.DEFAULT,
conversionMode = AUTO
)
)
Expand All @@ -100,8 +98,7 @@ class DelimitedValuesTest : TestBase() {
"""[{_1: "1", _2: "2", _3: "3"}]""",
read(
"""1,2,3""",
delimiter = ',',
hasHeader = false,
CSVFormat.DEFAULT,
conversionMode = NONE
)
)
Expand All @@ -119,8 +116,7 @@ class DelimitedValuesTest : TestBase() {
|1.0,2e0,2007-10-10T12:00:00Z
|hello,{,}
""".trimMargin(),
delimiter = ',',
hasHeader = false,
CSVFormat.DEFAULT,
conversionMode = AUTO
)
)
Expand All @@ -139,8 +135,7 @@ class DelimitedValuesTest : TestBase() {
|1.0,2e0,2007-10-10T12:00:00Z
|hello,{,}
""".trimMargin(),
delimiter = ',',
hasHeader = true,
CSVFormat.DEFAULT.withFirstRecordAsHeader(),
conversionMode = AUTO
)
)
Expand Down

0 comments on commit 5044be1

Please sign in to comment.